{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from scipy.stats import skew\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory that holds the Ames housing CSV files; edit here to relocate the data.\n",
    "DATA_DIR = './Ames_House'\n",
    "\n",
    "# Load the two CSV files into DataFrames. In the previews below, the training\n",
    "# frame carries a SalePrice column that the test frame does not have.\n",
    "train = pd.read_csv(DATA_DIR + '/Ames_House_train.csv')\n",
    "test = pd.read_csv(DATA_DIR + '/Ames_House_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 81 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0   1          60       RL         65.0     8450   Pave   NaN      Reg   \n",
       "1   2          20       RL         80.0     9600   Pave   NaN      Reg   \n",
       "2   3          60       RL         68.0    11250   Pave   NaN      IR1   \n",
       "3   4          70       RL         60.0     9550   Pave   NaN      IR1   \n",
       "4   5          60       RL         84.0    14260   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \\\n",
       "0         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "1         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "2         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "3         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "4         Lvl    AllPub    ...            0    NaN   NaN         NaN       0   \n",
       "\n",
       "  MoSold YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0      2   2008        WD         Normal     208500  \n",
       "1      5   2007        WD         Normal     181500  \n",
       "2      9   2008        WD         Normal     223500  \n",
       "3      2   2006        WD        Abnorml     140000  \n",
       "4     12   2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 81 columns]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the training set (head() defaults to n=5).\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "train.shape: (1460, 81)\n"
     ]
    }
   ],
   "source": [
    "# Report the (rows, columns) tuple of the training frame behind a fixed label.\n",
    "# sep='' keeps print() from inserting its default space after the label.\n",
    "print('train.shape: ', train.shape, sep='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>...</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1461</td>\n",
       "      <td>20</td>\n",
       "      <td>RH</td>\n",
       "      <td>80.0</td>\n",
       "      <td>11622</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>120</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1462</td>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>81.0</td>\n",
       "      <td>14267</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Gar2</td>\n",
       "      <td>12500</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1463</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>74.0</td>\n",
       "      <td>13830</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>MnPrv</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1464</td>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>78.0</td>\n",
       "      <td>9978</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>6</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1465</td>\n",
       "      <td>120</td>\n",
       "      <td>RL</td>\n",
       "      <td>43.0</td>\n",
       "      <td>5005</td>\n",
       "      <td>Pave</td>\n",
       "      <td>NaN</td>\n",
       "      <td>IR1</td>\n",
       "      <td>HLS</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>...</td>\n",
       "      <td>144</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2010</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0  1461          20       RH         80.0    11622   Pave   NaN      Reg   \n",
       "1  1462          20       RL         81.0    14267   Pave   NaN      IR1   \n",
       "2  1463          60       RL         74.0    13830   Pave   NaN      IR1   \n",
       "3  1464          60       RL         78.0     9978   Pave   NaN      IR1   \n",
       "4  1465         120       RL         43.0     5005   Pave   NaN      IR1   \n",
       "\n",
       "  LandContour Utilities      ...       ScreenPorch PoolArea PoolQC  Fence  \\\n",
       "0         Lvl    AllPub      ...               120        0    NaN  MnPrv   \n",
       "1         Lvl    AllPub      ...                 0        0    NaN    NaN   \n",
       "2         Lvl    AllPub      ...                 0        0    NaN  MnPrv   \n",
       "3         Lvl    AllPub      ...                 0        0    NaN    NaN   \n",
       "4         HLS    AllPub      ...               144        0    NaN    NaN   \n",
       "\n",
       "  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  \n",
       "0         NaN       0      6    2010        WD         Normal  \n",
       "1        Gar2   12500      6    2010        WD         Normal  \n",
       "2         NaN       0      3    2010        WD         Normal  \n",
       "3         NaN       0      6    2010        WD         Normal  \n",
       "4         NaN       0      1    2010        WD         Normal  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the test set (head() defaults to n=5).\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "test.shape: (1459, 80)\n"
     ]
    }
   ],
   "source": [
    "# Report the (rows, columns) tuple of the test frame behind a fixed label.\n",
    "# sep='' keeps print() from inserting its default space after the label.\n",
    "print('test.shape: ', test.shape, sep='')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1460 entries, 0 to 1459\n",
      "Data columns (total 81 columns):\n",
      "Id               1460 non-null int64\n",
      "MSSubClass       1460 non-null int64\n",
      "MSZoning         1460 non-null object\n",
      "LotFrontage      1201 non-null float64\n",
      "LotArea          1460 non-null int64\n",
      "Street           1460 non-null object\n",
      "Alley            91 non-null object\n",
      "LotShape         1460 non-null object\n",
      "LandContour      1460 non-null object\n",
      "Utilities        1460 non-null object\n",
      "LotConfig        1460 non-null object\n",
      "LandSlope        1460 non-null object\n",
      "Neighborhood     1460 non-null object\n",
      "Condition1       1460 non-null object\n",
      "Condition2       1460 non-null object\n",
      "BldgType         1460 non-null object\n",
      "HouseStyle       1460 non-null object\n",
      "OverallQual      1460 non-null int64\n",
      "OverallCond      1460 non-null int64\n",
      "YearBuilt        1460 non-null int64\n",
      "YearRemodAdd     1460 non-null int64\n",
      "RoofStyle        1460 non-null object\n",
      "RoofMatl         1460 non-null object\n",
      "Exterior1st      1460 non-null object\n",
      "Exterior2nd      1460 non-null object\n",
      "MasVnrType       1452 non-null object\n",
      "MasVnrArea       1452 non-null float64\n",
      "ExterQual        1460 non-null object\n",
      "ExterCond        1460 non-null object\n",
      "Foundation       1460 non-null object\n",
      "BsmtQual         1423 non-null object\n",
      "BsmtCond         1423 non-null object\n",
      "BsmtExposure     1422 non-null object\n",
      "BsmtFinType1     1423 non-null object\n",
      "BsmtFinSF1       1460 non-null int64\n",
      "BsmtFinType2     1422 non-null object\n",
      "BsmtFinSF2       1460 non-null int64\n",
      "BsmtUnfSF        1460 non-null int64\n",
      "TotalBsmtSF      1460 non-null int64\n",
      "Heating          1460 non-null object\n",
      "HeatingQC        1460 non-null object\n",
      "CentralAir       1460 non-null object\n",
      "Electrical       1459 non-null object\n",
      "1stFlrSF         1460 non-null int64\n",
      "2ndFlrSF         1460 non-null int64\n",
      "LowQualFinSF     1460 non-null int64\n",
      "GrLivArea        1460 non-null int64\n",
      "BsmtFullBath     1460 non-null int64\n",
      "BsmtHalfBath     1460 non-null int64\n",
      "FullBath         1460 non-null int64\n",
      "HalfBath         1460 non-null int64\n",
      "BedroomAbvGr     1460 non-null int64\n",
      "KitchenAbvGr     1460 non-null int64\n",
      "KitchenQual      1460 non-null object\n",
      "TotRmsAbvGrd     1460 non-null int64\n",
      "Functional       1460 non-null object\n",
      "Fireplaces       1460 non-null int64\n",
      "FireplaceQu      770 non-null object\n",
      "GarageType       1379 non-null object\n",
      "GarageYrBlt      1379 non-null float64\n",
      "GarageFinish     1379 non-null object\n",
      "GarageCars       1460 non-null int64\n",
      "GarageArea       1460 non-null int64\n",
      "GarageQual       1379 non-null object\n",
      "GarageCond       1379 non-null object\n",
      "PavedDrive       1460 non-null object\n",
      "WoodDeckSF       1460 non-null int64\n",
      "OpenPorchSF      1460 non-null int64\n",
      "EnclosedPorch    1460 non-null int64\n",
      "3SsnPorch        1460 non-null int64\n",
      "ScreenPorch      1460 non-null int64\n",
      "PoolArea         1460 non-null int64\n",
      "PoolQC           7 non-null object\n",
      "Fence            281 non-null object\n",
      "MiscFeature      54 non-null object\n",
      "MiscVal          1460 non-null int64\n",
      "MoSold           1460 non-null int64\n",
      "YrSold           1460 non-null int64\n",
      "SaleType         1460 non-null object\n",
      "SaleCondition    1460 non-null object\n",
      "SalePrice        1460 non-null int64\n",
      "dtypes: float64(3), int64(35), object(43)\n",
      "memory usage: 924.0+ KB\n"
     ]
    }
   ],
   "source": [
    "# Print each training column's non-null count and dtype; a count below the\n",
    "# number of index entries means that column has missing values.\n",
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "RangeIndex: 1459 entries, 0 to 1458\n",
      "Data columns (total 80 columns):\n",
      "Id               1459 non-null int64\n",
      "MSSubClass       1459 non-null int64\n",
      "MSZoning         1455 non-null object\n",
      "LotFrontage      1232 non-null float64\n",
      "LotArea          1459 non-null int64\n",
      "Street           1459 non-null object\n",
      "Alley            107 non-null object\n",
      "LotShape         1459 non-null object\n",
      "LandContour      1459 non-null object\n",
      "Utilities        1457 non-null object\n",
      "LotConfig        1459 non-null object\n",
      "LandSlope        1459 non-null object\n",
      "Neighborhood     1459 non-null object\n",
      "Condition1       1459 non-null object\n",
      "Condition2       1459 non-null object\n",
      "BldgType         1459 non-null object\n",
      "HouseStyle       1459 non-null object\n",
      "OverallQual      1459 non-null int64\n",
      "OverallCond      1459 non-null int64\n",
      "YearBuilt        1459 non-null int64\n",
      "YearRemodAdd     1459 non-null int64\n",
      "RoofStyle        1459 non-null object\n",
      "RoofMatl         1459 non-null object\n",
      "Exterior1st      1458 non-null object\n",
      "Exterior2nd      1458 non-null object\n",
      "MasVnrType       1443 non-null object\n",
      "MasVnrArea       1444 non-null float64\n",
      "ExterQual        1459 non-null object\n",
      "ExterCond        1459 non-null object\n",
      "Foundation       1459 non-null object\n",
      "BsmtQual         1415 non-null object\n",
      "BsmtCond         1414 non-null object\n",
      "BsmtExposure     1415 non-null object\n",
      "BsmtFinType1     1417 non-null object\n",
      "BsmtFinSF1       1458 non-null float64\n",
      "BsmtFinType2     1417 non-null object\n",
      "BsmtFinSF2       1458 non-null float64\n",
      "BsmtUnfSF        1458 non-null float64\n",
      "TotalBsmtSF      1458 non-null float64\n",
      "Heating          1459 non-null object\n",
      "HeatingQC        1459 non-null object\n",
      "CentralAir       1459 non-null object\n",
      "Electrical       1459 non-null object\n",
      "1stFlrSF         1459 non-null int64\n",
      "2ndFlrSF         1459 non-null int64\n",
      "LowQualFinSF     1459 non-null int64\n",
      "GrLivArea        1459 non-null int64\n",
      "BsmtFullBath     1457 non-null float64\n",
      "BsmtHalfBath     1457 non-null float64\n",
      "FullBath         1459 non-null int64\n",
      "HalfBath         1459 non-null int64\n",
      "BedroomAbvGr     1459 non-null int64\n",
      "KitchenAbvGr     1459 non-null int64\n",
      "KitchenQual      1458 non-null object\n",
      "TotRmsAbvGrd     1459 non-null int64\n",
      "Functional       1457 non-null object\n",
      "Fireplaces       1459 non-null int64\n",
      "FireplaceQu      729 non-null object\n",
      "GarageType       1383 non-null object\n",
      "GarageYrBlt      1381 non-null float64\n",
      "GarageFinish     1381 non-null object\n",
      "GarageCars       1458 non-null float64\n",
      "GarageArea       1458 non-null float64\n",
      "GarageQual       1381 non-null object\n",
      "GarageCond       1381 non-null object\n",
      "PavedDrive       1459 non-null object\n",
      "WoodDeckSF       1459 non-null int64\n",
      "OpenPorchSF      1459 non-null int64\n",
      "EnclosedPorch    1459 non-null int64\n",
      "3SsnPorch        1459 non-null int64\n",
      "ScreenPorch      1459 non-null int64\n",
      "PoolArea         1459 non-null int64\n",
      "PoolQC           3 non-null object\n",
      "Fence            290 non-null object\n",
      "MiscFeature      51 non-null object\n",
      "MiscVal          1459 non-null int64\n",
      "MoSold           1459 non-null int64\n",
      "YrSold           1459 non-null int64\n",
      "SaleType         1458 non-null object\n",
      "SaleCondition    1459 non-null object\n",
      "dtypes: float64(11), int64(26), object(43)\n",
      "memory usage: 912.0+ KB\n"
     ]
    }
   ],
   "source": [
    "# Print each test column's non-null count and dtype; a count below the\n",
    "# number of index entries means that column has missing values.\n",
    "test.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Id</th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>OverallQual</th>\n",
       "      <th>OverallCond</th>\n",
       "      <th>YearBuilt</th>\n",
       "      <th>YearRemodAdd</th>\n",
       "      <th>MasVnrArea</th>\n",
       "      <th>BsmtFinSF1</th>\n",
       "      <th>...</th>\n",
       "      <th>WoodDeckSF</th>\n",
       "      <th>OpenPorchSF</th>\n",
       "      <th>EnclosedPorch</th>\n",
       "      <th>3SsnPorch</th>\n",
       "      <th>ScreenPorch</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1201.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1452.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "      <td>1460.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>730.500000</td>\n",
       "      <td>56.897260</td>\n",
       "      <td>70.049958</td>\n",
       "      <td>10516.828082</td>\n",
       "      <td>6.099315</td>\n",
       "      <td>5.575342</td>\n",
       "      <td>1971.267808</td>\n",
       "      <td>1984.865753</td>\n",
       "      <td>103.685262</td>\n",
       "      <td>443.639726</td>\n",
       "      <td>...</td>\n",
       "      <td>94.244521</td>\n",
       "      <td>46.660274</td>\n",
       "      <td>21.954110</td>\n",
       "      <td>3.409589</td>\n",
       "      <td>15.060959</td>\n",
       "      <td>2.758904</td>\n",
       "      <td>43.489041</td>\n",
       "      <td>6.321918</td>\n",
       "      <td>2007.815753</td>\n",
       "      <td>180921.195890</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>421.610009</td>\n",
       "      <td>42.300571</td>\n",
       "      <td>24.284752</td>\n",
       "      <td>9981.264932</td>\n",
       "      <td>1.382997</td>\n",
       "      <td>1.112799</td>\n",
       "      <td>30.202904</td>\n",
       "      <td>20.645407</td>\n",
       "      <td>181.066207</td>\n",
       "      <td>456.098091</td>\n",
       "      <td>...</td>\n",
       "      <td>125.338794</td>\n",
       "      <td>66.256028</td>\n",
       "      <td>61.119149</td>\n",
       "      <td>29.317331</td>\n",
       "      <td>55.757415</td>\n",
       "      <td>40.177307</td>\n",
       "      <td>496.123024</td>\n",
       "      <td>2.703626</td>\n",
       "      <td>1.328095</td>\n",
       "      <td>79442.502883</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>1.000000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>1300.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>1872.000000</td>\n",
       "      <td>1950.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>2006.000000</td>\n",
       "      <td>34900.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>365.750000</td>\n",
       "      <td>20.000000</td>\n",
       "      <td>59.000000</td>\n",
       "      <td>7553.500000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1954.000000</td>\n",
       "      <td>1967.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>2007.000000</td>\n",
       "      <td>129975.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>730.500000</td>\n",
       "      <td>50.000000</td>\n",
       "      <td>69.000000</td>\n",
       "      <td>9478.500000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>5.000000</td>\n",
       "      <td>1973.000000</td>\n",
       "      <td>1994.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>383.500000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>25.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2008.000000</td>\n",
       "      <td>163000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>1095.250000</td>\n",
       "      <td>70.000000</td>\n",
       "      <td>80.000000</td>\n",
       "      <td>11601.500000</td>\n",
       "      <td>7.000000</td>\n",
       "      <td>6.000000</td>\n",
       "      <td>2000.000000</td>\n",
       "      <td>2004.000000</td>\n",
       "      <td>166.000000</td>\n",
       "      <td>712.250000</td>\n",
       "      <td>...</td>\n",
       "      <td>168.000000</td>\n",
       "      <td>68.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>2009.000000</td>\n",
       "      <td>214000.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>1460.000000</td>\n",
       "      <td>190.000000</td>\n",
       "      <td>313.000000</td>\n",
       "      <td>215245.000000</td>\n",
       "      <td>10.000000</td>\n",
       "      <td>9.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>1600.000000</td>\n",
       "      <td>5644.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>857.000000</td>\n",
       "      <td>547.000000</td>\n",
       "      <td>552.000000</td>\n",
       "      <td>508.000000</td>\n",
       "      <td>480.000000</td>\n",
       "      <td>738.000000</td>\n",
       "      <td>15500.000000</td>\n",
       "      <td>12.000000</td>\n",
       "      <td>2010.000000</td>\n",
       "      <td>755000.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8 rows × 38 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \\\n",
       "count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   \n",
       "mean    730.500000    56.897260    70.049958   10516.828082     6.099315   \n",
       "std     421.610009    42.300571    24.284752    9981.264932     1.382997   \n",
       "min       1.000000    20.000000    21.000000    1300.000000     1.000000   \n",
       "25%     365.750000    20.000000    59.000000    7553.500000     5.000000   \n",
       "50%     730.500000    50.000000    69.000000    9478.500000     6.000000   \n",
       "75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   \n",
       "max    1460.000000   190.000000   313.000000  215245.000000    10.000000   \n",
       "\n",
       "       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  \\\n",
       "count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000   \n",
       "mean      5.575342  1971.267808   1984.865753   103.685262   443.639726   \n",
       "std       1.112799    30.202904     20.645407   181.066207   456.098091   \n",
       "min       1.000000  1872.000000   1950.000000     0.000000     0.000000   \n",
       "25%       5.000000  1954.000000   1967.000000     0.000000     0.000000   \n",
       "50%       5.000000  1973.000000   1994.000000     0.000000   383.500000   \n",
       "75%       6.000000  2000.000000   2004.000000   166.000000   712.250000   \n",
       "max       9.000000  2010.000000   2010.000000  1600.000000  5644.000000   \n",
       "\n",
       "           ...         WoodDeckSF  OpenPorchSF  EnclosedPorch    3SsnPorch  \\\n",
       "count      ...        1460.000000  1460.000000    1460.000000  1460.000000   \n",
       "mean       ...          94.244521    46.660274      21.954110     3.409589   \n",
       "std        ...         125.338794    66.256028      61.119149    29.317331   \n",
       "min        ...           0.000000     0.000000       0.000000     0.000000   \n",
       "25%        ...           0.000000     0.000000       0.000000     0.000000   \n",
       "50%        ...           0.000000    25.000000       0.000000     0.000000   \n",
       "75%        ...         168.000000    68.000000       0.000000     0.000000   \n",
       "max        ...         857.000000   547.000000     552.000000   508.000000   \n",
       "\n",
       "       ScreenPorch     PoolArea       MiscVal       MoSold       YrSold  \\\n",
       "count  1460.000000  1460.000000   1460.000000  1460.000000  1460.000000   \n",
       "mean     15.060959     2.758904     43.489041     6.321918  2007.815753   \n",
       "std      55.757415    40.177307    496.123024     2.703626     1.328095   \n",
       "min       0.000000     0.000000      0.000000     1.000000  2006.000000   \n",
       "25%       0.000000     0.000000      0.000000     5.000000  2007.000000   \n",
       "50%       0.000000     0.000000      0.000000     6.000000  2008.000000   \n",
       "75%       0.000000     0.000000      0.000000     8.000000  2009.000000   \n",
       "max     480.000000   738.000000  15500.000000    12.000000  2010.000000   \n",
       "\n",
       "           SalePrice  \n",
       "count    1460.000000  \n",
       "mean   180921.195890  \n",
       "std     79442.502883  \n",
       "min     34900.000000  \n",
       "25%    129975.000000  \n",
       "50%    163000.000000  \n",
       "75%    214000.000000  \n",
       "max    755000.000000  \n",
       "\n",
       "[8 rows x 38 columns]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the training set\n",
    "train.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "MSZoning属性的不同取值和出现的次数\n",
      "RL         1151\n",
      "RM          218\n",
      "FV           65\n",
      "RH           16\n",
      "C (all)      10\n",
      "Name: MSZoning, dtype: int64\n",
      "\n",
      "Street属性的不同取值和出现的次数\n",
      "Pave    1454\n",
      "Grvl       6\n",
      "Name: Street, dtype: int64\n",
      "\n",
      "Alley属性的不同取值和出现的次数\n",
      "Grvl    50\n",
      "Pave    41\n",
      "Name: Alley, dtype: int64\n",
      "\n",
      "LotShape属性的不同取值和出现的次数\n",
      "Reg    925\n",
      "IR1    484\n",
      "IR2     41\n",
      "IR3     10\n",
      "Name: LotShape, dtype: int64\n",
      "\n",
      "LandContour属性的不同取值和出现的次数\n",
      "Lvl    1311\n",
      "Bnk      63\n",
      "HLS      50\n",
      "Low      36\n",
      "Name: LandContour, dtype: int64\n",
      "\n",
      "Utilities属性的不同取值和出现的次数\n",
      "AllPub    1459\n",
      "NoSeWa       1\n",
      "Name: Utilities, dtype: int64\n",
      "\n",
      "LotConfig属性的不同取值和出现的次数\n",
      "Inside     1052\n",
      "Corner      263\n",
      "CulDSac      94\n",
      "FR2          47\n",
      "FR3           4\n",
      "Name: LotConfig, dtype: int64\n",
      "\n",
      "LandSlope属性的不同取值和出现的次数\n",
      "Gtl    1382\n",
      "Mod      65\n",
      "Sev      13\n",
      "Name: LandSlope, dtype: int64\n",
      "\n",
      "Neighborhood属性的不同取值和出现的次数\n",
      "NAmes      225\n",
      "CollgCr    150\n",
      "OldTown    113\n",
      "Edwards    100\n",
      "Somerst     86\n",
      "Gilbert     79\n",
      "NridgHt     77\n",
      "Sawyer      74\n",
      "NWAmes      73\n",
      "SawyerW     59\n",
      "BrkSide     58\n",
      "Crawfor     51\n",
      "Mitchel     49\n",
      "NoRidge     41\n",
      "Timber      38\n",
      "IDOTRR      37\n",
      "ClearCr     28\n",
      "StoneBr     25\n",
      "SWISU       25\n",
      "Blmngtn     17\n",
      "MeadowV     17\n",
      "BrDale      16\n",
      "Veenker     11\n",
      "NPkVill      9\n",
      "Blueste      2\n",
      "Name: Neighborhood, dtype: int64\n",
      "\n",
      "Condition1属性的不同取值和出现的次数\n",
      "Norm      1260\n",
      "Feedr       81\n",
      "Artery      48\n",
      "RRAn        26\n",
      "PosN        19\n",
      "RRAe        11\n",
      "PosA         8\n",
      "RRNn         5\n",
      "RRNe         2\n",
      "Name: Condition1, dtype: int64\n",
      "\n",
      "Condition2属性的不同取值和出现的次数\n",
      "Norm      1445\n",
      "Feedr        6\n",
      "PosN         2\n",
      "Artery       2\n",
      "RRNn         2\n",
      "RRAe         1\n",
      "RRAn         1\n",
      "PosA         1\n",
      "Name: Condition2, dtype: int64\n",
      "\n",
      "BldgType属性的不同取值和出现的次数\n",
      "1Fam      1220\n",
      "TwnhsE     114\n",
      "Duplex      52\n",
      "Twnhs       43\n",
      "2fmCon      31\n",
      "Name: BldgType, dtype: int64\n",
      "\n",
      "HouseStyle属性的不同取值和出现的次数\n",
      "1Story    726\n",
      "2Story    445\n",
      "1.5Fin    154\n",
      "SLvl       65\n",
      "SFoyer     37\n",
      "1.5Unf     14\n",
      "2.5Unf     11\n",
      "2.5Fin      8\n",
      "Name: HouseStyle, dtype: int64\n",
      "\n",
      "RoofStyle属性的不同取值和出现的次数\n",
      "Gable      1141\n",
      "Hip         286\n",
      "Flat         13\n",
      "Gambrel      11\n",
      "Mansard       7\n",
      "Shed          2\n",
      "Name: RoofStyle, dtype: int64\n",
      "\n",
      "RoofMatl属性的不同取值和出现的次数\n",
      "CompShg    1434\n",
      "Tar&Grv      11\n",
      "WdShngl       6\n",
      "WdShake       5\n",
      "Roll          1\n",
      "Metal         1\n",
      "ClyTile       1\n",
      "Membran       1\n",
      "Name: RoofMatl, dtype: int64\n",
      "\n",
      "Exterior1st属性的不同取值和出现的次数\n",
      "VinylSd    515\n",
      "HdBoard    222\n",
      "MetalSd    220\n",
      "Wd Sdng    206\n",
      "Plywood    108\n",
      "CemntBd     61\n",
      "BrkFace     50\n",
      "WdShing     26\n",
      "Stucco      25\n",
      "AsbShng     20\n",
      "Stone        2\n",
      "BrkComm      2\n",
      "ImStucc      1\n",
      "CBlock       1\n",
      "AsphShn      1\n",
      "Name: Exterior1st, dtype: int64\n",
      "\n",
      "Exterior2nd属性的不同取值和出现的次数\n",
      "VinylSd    504\n",
      "MetalSd    214\n",
      "HdBoard    207\n",
      "Wd Sdng    197\n",
      "Plywood    142\n",
      "CmentBd     60\n",
      "Wd Shng     38\n",
      "Stucco      26\n",
      "BrkFace     25\n",
      "AsbShng     20\n",
      "ImStucc     10\n",
      "Brk Cmn      7\n",
      "Stone        5\n",
      "AsphShn      3\n",
      "Other        1\n",
      "CBlock       1\n",
      "Name: Exterior2nd, dtype: int64\n",
      "\n",
      "MasVnrType属性的不同取值和出现的次数\n",
      "None       864\n",
      "BrkFace    445\n",
      "Stone      128\n",
      "BrkCmn      15\n",
      "Name: MasVnrType, dtype: int64\n",
      "\n",
      "ExterQual属性的不同取值和出现的次数\n",
      "TA    906\n",
      "Gd    488\n",
      "Ex     52\n",
      "Fa     14\n",
      "Name: ExterQual, dtype: int64\n",
      "\n",
      "ExterCond属性的不同取值和出现的次数\n",
      "TA    1282\n",
      "Gd     146\n",
      "Fa      28\n",
      "Ex       3\n",
      "Po       1\n",
      "Name: ExterCond, dtype: int64\n",
      "\n",
      "Foundation属性的不同取值和出现的次数\n",
      "PConc     647\n",
      "CBlock    634\n",
      "BrkTil    146\n",
      "Slab       24\n",
      "Stone       6\n",
      "Wood        3\n",
      "Name: Foundation, dtype: int64\n",
      "\n",
      "BsmtQual属性的不同取值和出现的次数\n",
      "TA    649\n",
      "Gd    618\n",
      "Ex    121\n",
      "Fa     35\n",
      "Name: BsmtQual, dtype: int64\n",
      "\n",
      "BsmtCond属性的不同取值和出现的次数\n",
      "TA    1311\n",
      "Gd      65\n",
      "Fa      45\n",
      "Po       2\n",
      "Name: BsmtCond, dtype: int64\n",
      "\n",
      "BsmtExposure属性的不同取值和出现的次数\n",
      "No    953\n",
      "Av    221\n",
      "Gd    134\n",
      "Mn    114\n",
      "Name: BsmtExposure, dtype: int64\n",
      "\n",
      "BsmtFinType1属性的不同取值和出现的次数\n",
      "Unf    430\n",
      "GLQ    418\n",
      "ALQ    220\n",
      "BLQ    148\n",
      "Rec    133\n",
      "LwQ     74\n",
      "Name: BsmtFinType1, dtype: int64\n",
      "\n",
      "BsmtFinType2属性的不同取值和出现的次数\n",
      "Unf    1256\n",
      "Rec      54\n",
      "LwQ      46\n",
      "BLQ      33\n",
      "ALQ      19\n",
      "GLQ      14\n",
      "Name: BsmtFinType2, dtype: int64\n",
      "\n",
      "Heating属性的不同取值和出现的次数\n",
      "GasA     1428\n",
      "GasW       18\n",
      "Grav        7\n",
      "Wall        4\n",
      "OthW        2\n",
      "Floor       1\n",
      "Name: Heating, dtype: int64\n",
      "\n",
      "HeatingQC属性的不同取值和出现的次数\n",
      "Ex    741\n",
      "TA    428\n",
      "Gd    241\n",
      "Fa     49\n",
      "Po      1\n",
      "Name: HeatingQC, dtype: int64\n",
      "\n",
      "CentralAir属性的不同取值和出现的次数\n",
      "Y    1365\n",
      "N      95\n",
      "Name: CentralAir, dtype: int64\n",
      "\n",
      "Electrical属性的不同取值和出现的次数\n",
      "SBrkr    1334\n",
      "FuseA      94\n",
      "FuseF      27\n",
      "FuseP       3\n",
      "Mix         1\n",
      "Name: Electrical, dtype: int64\n",
      "\n",
      "KitchenQual属性的不同取值和出现的次数\n",
      "TA    735\n",
      "Gd    586\n",
      "Ex    100\n",
      "Fa     39\n",
      "Name: KitchenQual, dtype: int64\n",
      "\n",
      "Functional属性的不同取值和出现的次数\n",
      "Typ     1360\n",
      "Min2      34\n",
      "Min1      31\n",
      "Mod       15\n",
      "Maj1      14\n",
      "Maj2       5\n",
      "Sev        1\n",
      "Name: Functional, dtype: int64\n",
      "\n",
      "FireplaceQu属性的不同取值和出现的次数\n",
      "Gd    380\n",
      "TA    313\n",
      "Fa     33\n",
      "Ex     24\n",
      "Po     20\n",
      "Name: FireplaceQu, dtype: int64\n",
      "\n",
      "GarageType属性的不同取值和出现的次数\n",
      "Attchd     870\n",
      "Detchd     387\n",
      "BuiltIn     88\n",
      "Basment     19\n",
      "CarPort      9\n",
      "2Types       6\n",
      "Name: GarageType, dtype: int64\n",
      "\n",
      "GarageFinish属性的不同取值和出现的次数\n",
      "Unf    605\n",
      "RFn    422\n",
      "Fin    352\n",
      "Name: GarageFinish, dtype: int64\n",
      "\n",
      "GarageQual属性的不同取值和出现的次数\n",
      "TA    1311\n",
      "Fa      48\n",
      "Gd      14\n",
      "Po       3\n",
      "Ex       3\n",
      "Name: GarageQual, dtype: int64\n",
      "\n",
      "GarageCond属性的不同取值和出现的次数\n",
      "TA    1326\n",
      "Fa      35\n",
      "Gd       9\n",
      "Po       7\n",
      "Ex       2\n",
      "Name: GarageCond, dtype: int64\n",
      "\n",
      "PavedDrive属性的不同取值和出现的次数\n",
      "Y    1340\n",
      "N      90\n",
      "P      30\n",
      "Name: PavedDrive, dtype: int64\n",
      "\n",
      "PoolQC属性的不同取值和出现的次数\n",
      "Gd    3\n",
      "Ex    2\n",
      "Fa    2\n",
      "Name: PoolQC, dtype: int64\n",
      "\n",
      "Fence属性的不同取值和出现的次数\n",
      "MnPrv    157\n",
      "GdPrv     59\n",
      "GdWo      54\n",
      "MnWw      11\n",
      "Name: Fence, dtype: int64\n",
      "\n",
      "MiscFeature属性的不同取值和出现的次数\n",
      "Shed    49\n",
      "Gar2     2\n",
      "Othr     2\n",
      "TenC     1\n",
      "Name: MiscFeature, dtype: int64\n",
      "\n",
      "SaleType属性的不同取值和出现的次数\n",
      "WD       1267\n",
      "New       122\n",
      "COD        43\n",
      "ConLD       9\n",
      "ConLw       5\n",
      "ConLI       5\n",
      "CWD         4\n",
      "Oth         3\n",
      "Con         2\n",
      "Name: SaleType, dtype: int64\n",
      "\n",
      "SaleCondition属性的不同取值和出现的次数\n",
      "Normal     1198\n",
      "Partial     125\n",
      "Abnorml     101\n",
      "Family       20\n",
      "Alloca       12\n",
      "AdjLand       4\n",
      "Name: SaleCondition, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "# Names of every object-dtype (categorical) column in the training set\n",
    "categorical_features = train.select_dtypes(include=['object']).columns\n",
    "\n",
    "# For each categorical column, print a header line followed by the\n",
    "# number of occurrences of each distinct value in that column\n",
    "for col in categorical_features:\n",
    "    header = '\\n%s属性的不同取值和出现的次数'%col\n",
    "    print(header, train[col].value_counts(), sep='\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the 'Id' column from the training frame.\n",
    "# Rebinding (rather than inplace=True) together with errors='ignore' keeps this cell\n",
    "# safe to re-run: a second execution no longer raises KeyError once 'Id' is gone.\n",
    "train = train.drop(['Id'], axis=1, errors='ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set the test-set Ids aside before removing the column from the test frame\n",
    "# (presumably needed later to label predictions — confirm against later cells).\n",
    "# The guard makes the cell re-runnable: on a second execution 'Id' is already gone,\n",
    "# so test_id keeps its earlier value instead of the lookup raising KeyError.\n",
    "if 'Id' in test.columns:\n",
    "    test_id = test['Id']\n",
    "    test = test.drop(['Id'], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZsAAAEWCAYAAACwtjr+AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvpW3flQAAIABJREFUeJzt3Xu8XWV95/HP13CtJCbAaYokNVijNs5MMZxCvIwXrCHBS2gHKYzTRIaSqbdXfbVThXFqEG3HXkYro2JRGBJvgKgltdAYgeroq4GccI9IcxAyJAI5EuBE6Yjib/5YzyYrm307+5y1195rf9+v13qdtZ51eZ51crJ/+3nW8zxLEYGZmVmRnlV2AczMrPocbMzMrHAONmZmVjgHGzMzK5yDjZmZFc7BxszMCudgY1ZH0j9J+v0m+66TtKagfN8u6WFJP5Z0VBF5dEvSayTtym1vl/SaEotkA+agsgtgNh2S7gd+PyK+2Yv8ImJlEdeVdDDwUWBZRNxeRB5TLE8AiyNivNH+iHhJj4tkA841G7P+MB84DNg+1ROVGYj/y5L8BXdIDcQfqFk3JJ0raVzSXkkbJT03t+/lkrZKejz9fHmTaxwj6Q5Jf5K2n25ik/Q2Sd+R9NeSHpV0n6SVuXOPk/RtSfskfVPSJyV9vkEeLwTuSZuPSbqhXRlTOf5M0neBJ4DnN7jur6fjHkvNXm+uO//3c9tvk/SdtP7tlHx7atL73QbXvl/Sb6X1Z0k6T9K9kh6RdJWkI9O+RZJC0jmS/i9wg6TDJH0+HftYurf5jX7/Vh0ONlZJkk4G/gdwBnAMsBO4Iu07EvgH4CLgKLLmq3+of04i6TjgW8AnIuKvmmR1ElmgOBr4S+BSSUr7vgjcnPK4APi9RheIiH8Bas1ScyPi5A7L+HvAWmB2ur982Q8G/h74BvDLwLuBL0h6UZP7yJfnVWn1NyLiiIi4ss0p7wZOA14NPBd4FPhk3TGvBn4dOAVYAzwHWJju7Q+Af21XLhtsDjZWVW8FLouIWyLip8D5wMskLQLeAOyIiM9FxM8j4kvA94E35c5fAtwIrIuIS1rkszMiPhMRTwHryQLbfEm/Cvwm8IGIeDIivgNsnEL5Oynj5RGxPe3/Wd35y4AjgI+k/G8Avg6cNYUydOoPgPdHxK70u74AOL2uyeyCiPhJRPwr8DOyIPOCiHgqIrZFxGQB5bI+4mBjVfVcct/2I+LHwCPAsfX7kp1pX81bgd3A1W3yeSiXxxNp9YiUx95cGsAD3Za/SRlbXe+5wAMR8YsW58+U5wFfS01ijwF3A0+RPYeqyZf1c8Am4ApJP5T0l6kmZhXmYGNV9UOyD0EAJD2b7Nv07vp9ya+mfTUXAD8CvihpVhf5PwgcKemXcmkLp3B+J2VsNWX7D4GFdR0H8uf/BMiX7VemULZ6DwArI2JubjksIhqWNSJ+FhEfjIglwMuBNwKrp5G/DQAHG6uCg9ND59pyEPAl4GxJx0s6FPhz4KaIuB+4FnihpP8o6aD0AHwJWTNTzc+AtwDPBjZMtbdXROwExoALJB0i6WUc2ATWTidlbOUmso4D75V0cBoT8ybScyvgNuB3JP2SpBcA59Sd/zANOh008WngzyQ9D0DSiKRVzQ6W9FpJ/zYF8Umy3/Uvmh1v1eBgY1VwLdkD5tpyQRp386fAV8hqGb8GnAkQEY+QfZv+Y7KmtfcCb4yIH+UvGhFPAr9D1hx0WRfdi98KvCzl8WHgSuCnnZzYaRlbnP8kWXBZSVZD+xSwOiK+nw75GPAkWVBZD3yh7hIXAOtT09gZbbL7ONnzqG9I2gdsIes40cyvkDVPTpI1uX2LrGnNKkx+eZpZb0i6Evh+RKwruyxmveaajVlBJP2mpF9L41BWAKuAvyu7XGZl8Ghes+L8CvBVso4Ju4C3R8St5RbJrBxuRjMzs8K5Gc3MzArnZrTk6KOPjkWL
FpVdDDOzgbJt27YfRcRIu+McbJJFixYxNjZWdjHMzAaKpPqZLhpyM5qZmRXOwcbMzArnYGNmZoVzsDEzs8I52JiZWeEcbMxs4M2ZA9Izlzlzyi6Z1TjYmNnA27dvaunWew42ZmZWOAcbMxsabm4rj4ONmQ0NN7eVx8HGzMwK52BjZgNv9uyppVvveSJOMxt4k5Nll8Dacc3GzMwK52BjZkPDzW3lcTOamQ0NN7eVxzUbMzMrnIONmZkVzsHGzMwK52BjZmaFKyzYSHqRpNtyy6Sk90g6UtJmSTvSz3npeEm6SNK4pDskLc1da006foekNbn0EyTdmc65SJJSesM8zMysHIUFm4i4JyKOj4jjgROAJ4CvAecB10fEYuD6tA2wEliclrXAxZAFDmAdcBJwIrAuFzwuBs7NnbcipTfLw8zMStCrZrTXAfdGxE5gFbA+pa8HTkvrq4ANkdkCzJV0DHAKsDki9kbEo8BmYEXaNycitkREABvqrtUoDzMzK0Gvgs2ZwJfS+vyIeDCtPwTMT+vHAg/kztmV0lql72qQ3iqPA0haK2lM0tjExMSUb8rMzDpTeLCRdAjwZuDL9ftSjSSKzL9VHhFxSUSMRsToyMhIkcUwMxtqvajZrARuiYiH0/bDqQmM9HNPSt8NLMydtyCltUpf0CC9VR5mZlaCXgSbs9jfhAawEaj1KFsDXJNLX516pS0DHk9NYZuA5ZLmpY4By4FNad+kpGWpF9rqums1ysPMzEpQ6Nxokp4NvB74L7nkjwBXSToH2AmckdKvBU4Fxsl6rp0NEBF7JX0I2JqOuzAi9qb1dwCXA4cD16WlVR5mZlYCZY80bHR0NMbGxsouhpnZQJG0LSJG2x3nGQTMzKxwDjZmZlY4BxszMyucg42ZmRXOwcbMzArnYGNmZoVzsDEzs8I52JiZWeEcbMzMrHAONmZmVjgHGzMzK5yDjZmZFc7Bxsw6MmcOSM9c5swpu2Q2CBxszKwj+/ZNLd0sz8HGzMwK52BjZmaFc7Axw88jzIrmYGOGn0eYFc3Bxsw6Mnv21NLN8goNNpLmSrpa0vcl3S3pZZKOlLRZ0o70c146VpIukjQu6Q5JS3PXWZOO3yFpTS79BEl3pnMukqSU3jAPM+ve5CREPHOZnCy7ZDYIiq7ZfBz4x4h4MfAbwN3AecD1EbEYuD5tA6wEFqdlLXAxZIEDWAecBJwIrMsFj4uBc3PnrUjpzfIwM7MSFBZsJD0HeBVwKUBEPBkRjwGrgPXpsPXAaWl9FbAhMluAuZKOAU4BNkfE3oh4FNgMrEj75kTElogIYEPdtRrlYWZmJSiyZnMcMAH8b0m3SvqspGcD8yPiwXTMQ8D8tH4s8EDu/F0prVX6rgbptMjjAJLWShqTNDYxMdHNPVpF+HmEWbGKDDYHAUuBiyPipcBPqGvOSjWSKLAMLfOIiEsiYjQiRkdGRooshvW5Xj6PcDdrG0ZFBptdwK6IuCltX00WfB5OTWCkn3vS/t3Awtz5C1Jaq/QFDdJpkYdZ6dzN2oZRYcEmIh4CHpD0opT0OuB7wEag1qNsDXBNWt8IrE690pYBj6emsE3AcknzUseA5cCmtG9S0rLUC2113bUa5WFmZiU4qODrvxv4gqRDgB8AZ5MFuKsknQPsBM5Ix14LnAqMA0+kY4mIvZI+BGxNx10YEXvT+juAy4HDgevSAvCRJnmYmVkJlD3SsNHR0RgbGyu7GDYEstFgjfm/ow0aSdsiYrTdcZ5BwMzMCudgY9Zj7mZtw6joZzZmVsfTu9gwcs3GrE6vxsF4vI0NEwcbszq9Ggfj8TY2TBxszMyscA42ZmZWOAcbMzMrnIONWUW5A4L1Ewcbszq9GgdTdD7ugGD9xONszOr0ahyMx9vYMHHNxszMCudgY2ZmhXOwMTOzwjnYmFWUJ/y0fuIOAmYV5Q4I1k9cszEzs8I52JiZWeEKDTaS7pd0p6TbJI2ltCMlbZa0I/2cl9Il6SJJ45Lu
kLQ0d5016fgdktbk0k9I1x9P56pVHjbcPKLerDy9qNm8NiKOz72j+jzg+ohYDFyftgFWAovTsha4GLLAAawDTgJOBNblgsfFwLm581a0ycOGmEfUm5WnjGa0VcD6tL4eOC2XviEyW4C5ko4BTgE2R8TeiHgU2AysSPvmRMSWiAhgQ921GuVhZmYlKDrYBPANSdskrU1p8yPiwbT+EDA/rR8LPJA7d1dKa5W+q0F6qzwOIGmtpDFJYxMTE1O+OTMz60zRweaVEbGUrInsnZJeld+ZaiRRZAFa5RERl0TEaESMjoyMFFkMsxnnZ1A2SAoNNhGxO/3cA3yN7JnLw6kJjPRzTzp8N7Awd/qClNYqfUGDdFrkYVYZfgZlg6SwYCPp2ZJm19aB5cBdwEag1qNsDXBNWt8IrE690pYBj6emsE3AcknzUseA5cCmtG9S0rLUC2113bUa5WFDzCPqzcpT5AwC84Gvpd7IBwFfjIh/lLQVuErSOcBO4Ix0/LXAqcA48ARwNkBE7JX0IWBrOu7CiNib1t8BXA4cDlyXFoCPNMnDhphH1HdvzpzGNabZs/17tc4oe6Rho6OjMTY2VnYxzDqWfY9rbKb/W/cyLxsskrblhrY05RkEzGaAH9abteZgY5VQ9od9GQ/r/QzKBolnfbZKGMaeWX5WYoOk45qNpFdKOjutj0g6rrhimZlZlXQUbCStA94HnJ+SDgY+X1ShzHqh7Ka3QeImO5uuTpvRfht4KXALQET8sDaGxmxQDWPTW7fcZGfT1Wkz2pP5aV/SIE0zS1p9w3etyazzYHOVpL8lm4n5XOCbwGeKK5bZ1JTdzDM5mY03qS3NuNZkw6qjZrSI+GtJrwcmgRcBH4iIzYWWzKyBViPZB2Vw4Zw5zZulPFLfqqqjYJN6nv2fWoCRdLikRRFxf5GFM6tXhecsrcpahfsza6TTZrQvA7/IbT+V0swGVtlNb0VwDzvrV532RjsoIp6sbUTEk5IOKahMZj1RxWYp14ysX3Vas5mQ9ObahqRVwI+KKZJZd7r9Vl9EbaBd7ci1Dxs2ndZs/gD4gqRPACJ7TfPqwkpl1oVuv9UXURuYnGw9U/JM52fW7zrtjXYvsEzSEWn7x4WWyqyJ2bOb99Zq9WEt9b5HV7syTeWcQX6OZAZtgo2k/xQRn5f0R3XpAETERwssm9kztAoW7WoSvag5dNJ1uVU5q/gcyQza12xqMwX4e5VZB8p+QN+qNtVqfI9Z0VoGm4j4W0mzgMmI+FiPymRmXWr1rMjPhKxMbXujRcRTwFk9KIvZlNT3IutWp/OaTbfXWK283ZTDbNB12vX5u5I+IenfS1paWzo5UdIsSbdK+nraPk7STZLGJV1ZG68j6dC0PZ72L8pd4/yUfo+kU3LpK1LauKTzcukN87Bqmalv6vXzmhU1v1mr8yLcxGXV1mmwOR54CXAh8D/T8tcdnvuHwN257b8APhYRLwAeBc5J6ecAj6b0j6XjkLQEODPlvwL4VApgs4BPAiuBJcBZ6dhWedgAaDbupX6ZKtcczMrTUbCJiNc2WE5ud56kBcAbgM+mbQEnA1enQ9YDp6X1VWmbtP916fhVwBUR8dOIuA8YB05My3hE/CDNbnAFsKpNHjZNvRg4ORM1lkY1lV7UHKo4BY7ZTGgZbCSdJOl2ST+W9M+Sfn2K1/8b4L3sn1ftKOCxiPh52t4FHJvWjyUbLEra/3g6/un0unOapbfKo/7+1koakzQ2MTExxVsbTv00cLII+aDYjWZNcr0e3zOVdLNeaFez+STwX8k+wD9KFjw6IumNwJ6I2NZ98YoVEZdExGhEjI6MjJRdHJtB3T7Mn4ng16wW1yv9EPDM6rULNs+KiM2pCevLwFQ+kV8BvFnS/WRNXCcDHyd7AVuty/UCYHda3w0sBEj7nwM8kk+vO6dZ+iMt8rAhU0TNqVENIR9gupk1oEyeKdp6oV2wmSvpd2pLg+2mIuL8iFgQEYvIHvDfEBFvBW4E
Tk+HrQGuSesb0zZp/w3pVdQbgTNTb7XjgMXAzcBWYHHqeXZIymNjOqdZHtaHyqoBTFWrGkKnAaYfaxyD0sRpg63dDALfAt7UZDuAr3aR5/uAKyR9GLgVuDSlXwp8TtI4sJcseBAR2yVdBXwP+DnwzjT2B0nvAjYBs4DLImJ7mzzMzKwEikF5l27BRkdHY2xsrOxi9L1uX1vc7LxemMqfeKuaVavrdFoj68f/bt3esxmApG0RMdruuI66PkuaL+lSSdel7SWSPHZlCHX78Ll2XtnPJ9pxTy6zYnQ6qPNysuaq56btfwHeU0SBrNp6XbuZapCYSjCdajdpBywbZp0Gm6Mj4irSeJk0huWpwkplNgOKfvjeSeCcPbs3HQGm06PMtTnrhU7f1PkTSUeRdQpA0jKyQZdmhevmJWQzpdUzqlZ6/axjOj3Kyu4NZ8Oh02DzR2RdkH9N0nfJxtuc3voUs+mrdTzopkt0J+e069jgbsFmM6PT10LfIunVwIsAAfdExM8KLZkNtV69wrk+aJTZa86sytq9FrrZwM0XSiIiuhlnY0OsWZPYdIJLvsmqqBrQTF2zV0HUrN+0q9m8qcW+bgd12hAr6oO2jBpJN8+SXGuyYdXutdBn96ogNjy6GRjaqkYE5XyId/ssqQjtfj9mZeu0gwCS3kD2ArPDamkRcWERhbJqm8pD925nLOiFfgk0UP7vwqydTmcQ+DTwu8C7yToIvAV4XoHlMgPcG8ysKjod1PnyiFhN9trmDwIvA15YXLGsaqYy2n66LzDrd57G34ZRp8Hm/6WfT0h6Ltnsy8cUUySroqnURIap1jJM92rDrdNg8/eS5gJ/BdwC3Ad8sbBS2dDqtjbTrw/Ca1PV9IpfhGb9qtNg833gqYj4CtmrorcAf1dYqawyah9+Rah/4Vrtg72fAk+vmwP9jMv6VafB5k8jYp+kV5K93vmzwMXFFcuqolcfcvv27f8G7w9Ws/7TabCpzfD8BuAzEfEPwCHFFMmse90Gmn6qDZlVUafBZrekvyXr/nytpEOncK5Z3yurNuQgZ8Oi04BxBtnL006JiMeAI4E/aXWCpMMk3SzpdknbJX0wpR8n6SZJ45KulHRISj80bY+n/Yty1zo/pd8j6ZRc+oqUNi7pvFx6wzzM+k2++c+syjoKNhHxRER8NSJ2pO0HI+IbbU77KXByRPwGcDywIr0H5y+Aj0XEC4BHgdrrpc8hG8fzAuBj6TgkLQHOJJu9YAXwKUmzJM0i66ywElgCnJWOpUUeZn1ppmpWfhGa9avCmsIi8+O0eXBagqyDwdUpfT1wWlpflbZJ+18nSSn9ioj4aUTcB4wDJ6ZlPCJ+EBFPAlcAq9I5zfKwITCdD9ZB/1Ceymut67nbtBWp0OcuqQZyG7AH2AzcCzyWXisNsAs4Nq0fCzwAT792+nHgqHx63TnN0o9qkUd9+dZKGpM0NjExMZ1btYJNpUvz5GT3QSP/YT1s3G3ailRosImIpyLieGABWU3kxUXmN1URcUlEjEbE6MjISNnFqYyippupBYJOj+1G/didXnKNwqqsJz3KUqeCG8nmVJsrqTbb9AJgd1rfDSwESPufAzyST687p1n6Iy3ysBnUrNmlyG/CnXwAV+WD2jUKq5LCgo2kkTTFDZIOB14P3E0WdE5Ph60BrknrG9M2af8NEREp/czUW+04YDFwM7AVWJx6nh1C1olgYzqnWR42g3r5YTjVQFbGB3X9cxIz26/j99l04Rhgfeo19izgqoj4uqTvAVdI+jBwK3BpOv5S4HOSxoG9ZMGDiNgu6Srge2QTgL4zIp4CkPQusi7Zs4DLImJ7utb7muRhVhjpwPfsdBJwqjqztVk9hb+CATA6OhpjY2NlF2Og+IOysan8l2r3O+z1JJ79+qI661+StkXEaLvjiqzZmA2lfAAZpA/qQSmnDSZPOWNWoOk8Oxr0MT9meQ421jV/GHam255xrmlYlTjYWNf8YdiZWu2mUVdxs2Hh
YGNNefqS5rqp1XncjA0zBxtrqtX0JcP+zTz/uymiOdFNlFY1Djb2DEW+yrmK9u2bfnDoZuJMs0HiYGPP4OaeqZvp4OAmTKsaBxuzGTDTNUHPwGxV42BjT3Pz2WBwrccGkYONPc3fmgeDaz02iBxszApW6zzgVzbbMPPcaGYdajbPWaumx/xEmu5hZsPMNRuzNsrojuxakFWNazZmLZT14d4qsLkThw0i12zsaf7WnJnq4Mr6AZlFv63TtR4bRK7ZWNOXZll/8rMfG0Su2Qw5Bxoz64XCgo2khZJulPQ9Sdsl/WFKP1LSZkk70s95KV2SLpI0LukOSUtz11qTjt8haU0u/QRJd6ZzLpKy1uxmedgzOdAcqFVTVD8MpuyHMph1o8iazc+BP46IJcAy4J2SlgDnAddHxGLg+rQNsBJYnJa1wMWQBQ5gHXAScCKwLhc8LgbOzZ23IqU3y8OsqXavcG41mLJXQWCmBnQ6aFmvFRZsIuLBiLglre8D7gaOBVYB69Nh64HT0voqYENktgBzJR0DnAJsjoi9EfEosBlYkfbNiYgtERHAhrprNcrDrKnp1PIGbVT/oJXXBl9PntlIWgS8FLgJmB8RD6ZdDwHz0/qxwAO503altFbpuxqk0yIPSzwPmpn1UuHBRtIRwFeA90TEAY0UqUZSUAfR9nlIWitpTNLYxMREkcXoK+4UYGa9VmiwkXQwWaD5QkR8NSU/nJrASD/3pPTdwMLc6QtSWqv0BQ3SW+VxgIi4JCJGI2J0ZGSku5scQA40Zu35udbMKrI3moBLgbsj4qO5XRuBWo+yNcA1ufTVqVfaMuDx1BS2CVguaV7qGLAc2JT2TUpalvJaXXetRnlUnv+DFMeDJoeLn2vNrCIHdb4C+D3gTkm3pbT/BnwEuErSOcBO4Iy071rgVGAceAI4GyAi9kr6ELA1HXdhROxN6+8ALgcOB65LCy3yqLx2Pab8H6W5dsGk1lNtKs+6ZjpAzZ7d+N9wqvnM1HXMOqUoak6NATM6OhpjY2NlF2Pa/NB/arr585/K79j/vQZXp7N5DztJ2yJitN1xnkHAhlqrZsdm+zrlWoLZfg42NtRaNTu2anKsTbTZalJMz2Fmtp+DTUV43Ew5Jicbz/bsQDP4PLv2zHKwqQg/+O8t9/irvip/kSij16qDzYCZ7nOEYdau6Wu6HPBtUJTRrdvBZsD4A236/Ds06z2/PM2GhmuAZuVxzcYsx+MnzIrhYGNmZoVzsBkQ7trcO912IHCXWBsUZXTr9jObPubg0ntz5jTv2tru36MKXWJtOJTxt+qajVnOvn3Nxx645mLWPQebPuUBgv2j1lXaNRez7jnY9CmPBTGzKnGw6SP52QHMzKrEwaYP1IKMazP9z5MzmnXHvdH6gINMcWqDNGeqtujnNmbdcbCxSnOTpFl/cDOaDbzZsw+cAt7M+k9hwUbSZZL2SLorl3akpM2SdqSf81K6JF0kaVzSHZKW5s5Zk47fIWlNLv0ESXemcy6Ssu+wzfKw6qnS+0XMqq7Ims3lwIq6tPOA6yNiMXB92gZYCSxOy1rgYsgCB7AOOAk4EViXCx4XA+fmzlvRJg+zplwjMitWYcEmIr4N7K1LXgWsT+vrgdNy6RsiswWYK+kY4BRgc0TsjYhHgc3AirRvTkRsiYgANtRdq1EefcfznZnZsOj1M5v5EfFgWn8ImJ/WjwUeyB23K6W1St/VIL1VHs8gaa2kMUljExMTXdxO9+bMcS+0dmbP7l1XYwd9s2KV1kEg1UgKbbxol0dEXBIRoxExOjIyUmRRnsGBpjPdvAd+pgORx9CYTV+vg83DqQmM9HNPSt8NLMwdtyCltUpf0CC9VR42YLoNyM0CVDdBwx0QzGZGr4PNRqDWo2wNcE0ufXXqlbYMeDw1hW0ClkualzoGLAc2pX2TkpalXmir667VKA8rUT/UDpoFITMrXmGDOiV9CXgNcLSkXWS9yj4CXCXpHGAncEY6/FrgVGAc
eAI4GyAi9kr6ELA1HXdhRNQ6HbyDrMfb4cB1aaFFHlaS2bP31w78rMpsOCn81Q6A0dHRGBsb61l+w/RAut2fWKvfRS/+PMvO32yQSdoWEaPtjvMMAj2Qn83Zszr3H0+uaVY8z43WA8PcbNTJB/bs2Y1/R736sHcHALPiOdjMsGF/JtFNs5M/7M2qz81oM6zqgaaXAy3NrDpcs7GnRfhhuZkVw8GmZPkPcHccMLOqcjNaidzsZGbDwjWbHurXZqh+LZeZVYdrNl1qNHamVTPYoNRiWpVTyu7bzGyqHGy61KrXWaNJH/ftm/kP6/zrkKdzjbza/GHNVL23nZkVw81oBWn2odzqw7rZ4Mb8/m7GpLiZzMzK5ppNH2kXSDz40cwGlYONmZkVzsHGzMwK52DTpX6bsmWmy9Nv92dmg80dBLrU7vlJtzMZd3veTD/P8fMhM5tJDjYF6fbD2h/yZlZFbkYzM7PCVTbYSFoh6R5J45LOK7s8ZmbDrJLBRtIs4JPASmAJcJakJeWWysxseFUy2AAnAuMR8YOIeBK4AlhVcpnMzIZWVYPNscADue1dKe0AktZKGpM0NjEx0bPCmZkNm6HujRYRlwCXAEiakLSz5CLNtKOBH5VdiB4Yhvv0PVZH1e7zeZ0cVNVgsxtYmNtekNKaioiRQktUAkljETFadjmKNgz36XusjmG5z3pVbUbbCiyWdJykQ4AzgY0ll8nMbGhVsmYTET+X9C5gEzALuCwitpdcLDOzoVXJYAMQEdcC15ZdjpJdUnYBemQY7tP3WB3Dcp8HUPjNWmZmVrCqPrMxM7M+4mBjZmaFc7AZMJIuk7RH0l25tCMlbZa0I/2cl9Il6aI0P9wdkpbmzlmTjt8haU0Z99KMpIWSbpT0PUnbJf1hSq/MfUo6TNLNkm5P9/jBlH6cpJvSvVyZelMi6dC0PZ72L8pd6/yUfo+kU8q5o+YkzZJ0q6Svp+0q3uP9ku6UdJuksZRWmb/XGRERXgZoAV4FLAXuyqX9JXBeWj8P+Iu0fipwHSBgGXBTSj8S+EH6OS+tzyv73nL3cwywNK3PBv6FbI67ytxnKusRaf1g4KZU9quAM1P6p4G3p/V3AJ9O62cCV6b1JcDtwKHAccC9wKyy769aFG1kAAAFLElEQVTuXv8I+CLw9bRdxXu8Hzi6Lq0yf68zsbhmM2Ai4tvA3rrkVcD6tL4eOC2XviEyW4C5ko4BTgE2R8TeiHgU2AysKL70nYmIByPilrS+D7ibbLqhytxnKuuP0+bBaQngZODqlF5/j7V7vxp4nSSl9Csi4qcRcR8wTjY3YF+QtAB4A/DZtC0qdo8tVObvdSY42FTD/Ih4MK0/BMxP683miOto7rh+kJpSXkr2zb9S95mal24D9pB9sNwLPBYRP0+H5Mv79L2k/Y8DR9Hn9wj8DfBe4Bdp+yiqd4+QfVH4hqRtktamtEr9vU5XZcfZDKuICEmV6M8u6QjgK8B7ImIy+5KbqcJ9RsRTwPGS5gJfA15ccpFmlKQ3AnsiYpuk15RdnoK9MiJ2S/plYLOk7+d3VuHvdbpcs6mGh1M1nPRzT0pvNkfclOeO6zVJB5MFmi9ExFdTcuXuEyAiHgNuBF5G1qRS+xKYL+/T95L2Pwd4hP6+x1cAb5Z0P9lrPk4GPk617hGAiNidfu4h++JwIhX9e+2Wg001bARqPVfWANfk0len3i/LgMdTtX4TsFzSvNRDZnlK6wupnf5S4O6I+GhuV2XuU9JIqtEg6XDg9WTPpm4ETk+H1d9j7d5PB26I7KnyRuDM1JPrOGAxcHNv7qK1iDg/IhZExCKyB/43RMRbqdA9Akh6tqTZtXWyv7O7qNDf64wou4eCl6ktwJeAB4GfkbXpnkPWrn09sAP4JnBkOlZkbyy9F7gTGM1d5z+TPWgdB84u+77q7vGVZG3gdwC3peXUKt0n8O+AW9M93gV8IKU/n+yDdBz4MnBoSj8sbY+n/c/PXev96d7vAVaW
fW9N7vc17O+NVql7TPdze1q2A+9P6ZX5e52JxdPVmJlZ4dyMZmZmhXOwMTOzwjnYmJlZ4RxszMyscA42ZmZWOAcbsxkgab6kL0r6QZqy5J8l/XaD4xYpN2N3Lv1CSb/VQT7HSwpJlZkzy4aDg43ZNKVBqH8HfDsinh8RJ5ANYlxQd1zT6aEi4gMR8c0OsjsL+E762bAskvz/2vqO/yjNpu9k4MmI+HQtISJ2RsT/kvQ2SRsl3UA2wK8hSZdLOl3SCklfzqW/RvvfAyPgLcDbgNdLOiylL0rvedlANkB0oaTlqXZ1i6Qvp3nmkPQBSVsl3SXpEuUnnDMrkION2fS9BLilxf6lwOkR8eoOrvVN4KQ07QnA75LNKwbwcuC+iLgX+CeyqftrFgOfioiXAD8B/jvwWxGxFBgje6cMwCci4jcj4t8AhwNv7KBMZtPmYGM2wyR9UtkbOLempM0RUf8OooYim1r/H4E3pWa3N7B/Tq2z2B94ruDAprSdkb0bBbIXci0BvpteYbAGeF7a91plb8G8k6xG9pKp36HZ1PkVA2bTtx34D7WNiHinpKPJahSQ1TSm4grgXWQvyRuLiH2SZqU8Vkl6P9n8WkfVJoCsy0NkAe6A5zqp2e1TZHNxPSDpArL5yMwK55qN2fTdABwm6e25tF+axvW+Rdb0di77azKvA+6IiIURsSginkf2CoZn9HgDtgCvkPQCeHpW4heyP7D8KD3DOb3BuWaFcLAxm6bIZrM9DXi1pPsk3Uz2GuD3NTnlRZJ25Za31F3vKeDrwMr0E7Ims6/VXecrNOiVFhETZJ0IviTpDuCfgRdH9t6cz5B1ItgEbK0/16wonvXZzMwK55qNmZkVzsHGzMwK52BjZmaFc7AxM7PCOdiYmVnhHGzMzKxwDjZmZla4/w92Vqp3DCBW3QAAAABJRU5ErkJggg==\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x115f6e2e8>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Scatter living area against sale price to look for outliers\n",
    "fig, ax = plt.subplots()\n",
    "ax.scatter(train.GrLivArea, train.SalePrice, c='blue', marker='s')\n",
    "ax.set_title('Looking for outliers')\n",
    "ax.set_xlabel('GrLivArea')\n",
    "ax.set_ylabel('salePrice')\n",
    "plt.show()\n",
    "\n",
    "# Keep only the rows whose GrLivArea is below 4000\n",
    "is_below_cutoff = train.GrLivArea < 4000\n",
    "train = train[is_below_cutoff]\n",
    "# NOTE(review): reindex() is called without new labels, so `temp` appears to be just a copy\n",
    "# of `train` with its index unchanged; reset_index(drop=True) may have been intended — confirm.\n",
    "temp = train.reindex()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Data columns (total 80 columns):\n",
      "MSSubClass       1456 non-null int64\n",
      "MSZoning         1456 non-null object\n",
      "LotFrontage      1197 non-null float64\n",
      "LotArea          1456 non-null int64\n",
      "Street           1456 non-null object\n",
      "Alley            91 non-null object\n",
      "LotShape         1456 non-null object\n",
      "LandContour      1456 non-null object\n",
      "Utilities        1456 non-null object\n",
      "LotConfig        1456 non-null object\n",
      "LandSlope        1456 non-null object\n",
      "Neighborhood     1456 non-null object\n",
      "Condition1       1456 non-null object\n",
      "Condition2       1456 non-null object\n",
      "BldgType         1456 non-null object\n",
      "HouseStyle       1456 non-null object\n",
      "OverallQual      1456 non-null int64\n",
      "OverallCond      1456 non-null int64\n",
      "YearBuilt        1456 non-null int64\n",
      "YearRemodAdd     1456 non-null int64\n",
      "RoofStyle        1456 non-null object\n",
      "RoofMatl         1456 non-null object\n",
      "Exterior1st      1456 non-null object\n",
      "Exterior2nd      1456 non-null object\n",
      "MasVnrType       1448 non-null object\n",
      "MasVnrArea       1448 non-null float64\n",
      "ExterQual        1456 non-null object\n",
      "ExterCond        1456 non-null object\n",
      "Foundation       1456 non-null object\n",
      "BsmtQual         1419 non-null object\n",
      "BsmtCond         1419 non-null object\n",
      "BsmtExposure     1418 non-null object\n",
      "BsmtFinType1     1419 non-null object\n",
      "BsmtFinSF1       1456 non-null int64\n",
      "BsmtFinType2     1418 non-null object\n",
      "BsmtFinSF2       1456 non-null int64\n",
      "BsmtUnfSF        1456 non-null int64\n",
      "TotalBsmtSF      1456 non-null int64\n",
      "Heating          1456 non-null object\n",
      "HeatingQC        1456 non-null object\n",
      "CentralAir       1456 non-null object\n",
      "Electrical       1455 non-null object\n",
      "1stFlrSF         1456 non-null int64\n",
      "2ndFlrSF         1456 non-null int64\n",
      "LowQualFinSF     1456 non-null int64\n",
      "GrLivArea        1456 non-null int64\n",
      "BsmtFullBath     1456 non-null int64\n",
      "BsmtHalfBath     1456 non-null int64\n",
      "FullBath         1456 non-null int64\n",
      "HalfBath         1456 non-null int64\n",
      "BedroomAbvGr     1456 non-null int64\n",
      "KitchenAbvGr     1456 non-null int64\n",
      "KitchenQual      1456 non-null object\n",
      "TotRmsAbvGrd     1456 non-null int64\n",
      "Functional       1456 non-null object\n",
      "Fireplaces       1456 non-null int64\n",
      "FireplaceQu      766 non-null object\n",
      "GarageType       1375 non-null object\n",
      "GarageYrBlt      1375 non-null float64\n",
      "GarageFinish     1375 non-null object\n",
      "GarageCars       1456 non-null int64\n",
      "GarageArea       1456 non-null int64\n",
      "GarageQual       1375 non-null object\n",
      "GarageCond       1375 non-null object\n",
      "PavedDrive       1456 non-null object\n",
      "WoodDeckSF       1456 non-null int64\n",
      "OpenPorchSF      1456 non-null int64\n",
      "EnclosedPorch    1456 non-null int64\n",
      "3SsnPorch        1456 non-null int64\n",
      "ScreenPorch      1456 non-null int64\n",
      "PoolArea         1456 non-null int64\n",
      "PoolQC           5 non-null object\n",
      "Fence            280 non-null object\n",
      "MiscFeature      54 non-null object\n",
      "MiscVal          1456 non-null int64\n",
      "MoSold           1456 non-null int64\n",
      "YrSold           1456 non-null int64\n",
      "SaleType         1456 non-null object\n",
      "SaleCondition    1456 non-null object\n",
      "SalePrice        1456 non-null int64\n",
      "dtypes: float64(3), int64(34), object(43)\n",
      "memory usage: 921.4+ KB\n"
     ]
    }
   ],
   "source": [
    "train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>12</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \\\n",
       "0          60       RL         65.0     8450   Pave  None      Reg   \n",
       "1          20       RL         80.0     9600   Pave  None      Reg   \n",
       "2          60       RL         68.0    11250   Pave  None      IR1   \n",
       "3          70       RL         60.0     9550   Pave  None      IR1   \n",
       "4          60       RL         84.0    14260   Pave  None      IR1   \n",
       "\n",
       "  LandContour Utilities LotConfig    ...     PoolArea PoolQC Fence  \\\n",
       "0         Lvl    AllPub    Inside    ...            0     No    No   \n",
       "1         Lvl    AllPub       FR2    ...            0     No    No   \n",
       "2         Lvl    AllPub    Inside    ...            0     No    No   \n",
       "3         Lvl    AllPub    Corner    ...            0     No    No   \n",
       "4         Lvl    AllPub       FR2    ...            0     No    No   \n",
       "\n",
       "  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0          No       0      2    2008        WD         Normal     208500  \n",
       "1          No       0      5    2007        WD         Normal     181500  \n",
       "2          No       0      9    2008        WD         Normal     223500  \n",
       "3          No       0      2    2006        WD        Abnorml     140000  \n",
       "4          No       0     12    2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def process_missvalue_by_meaning (df):\n",
    "    # Alley : data description says NA means \"no alley access\"\n",
    "    df.loc[:, \"Alley\"] = df.loc[:, \"Alley\"].fillna(\"None\")\n",
    "\n",
    "    # BedroomAbvGr : NA most likely means 0\n",
    "    df.loc[:, \"BedroomAbvGr\"] = df.loc[:, \"BedroomAbvGr\"].fillna(0)\n",
    "\n",
    "    # BsmtQual etc : data description says NA for basement features is \"no basement\"\n",
    "    df.loc[:, \"BsmtQual\"] = df.loc[:, \"BsmtQual\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtCond\"] = df.loc[:, \"BsmtCond\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtExposure\"] = df.loc[:, \"BsmtExposure\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFinType1\"] = df.loc[:, \"BsmtFinType1\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFinType2\"] = df.loc[:, \"BsmtFinType2\"].fillna(\"No\")\n",
    "    df.loc[:, \"BsmtFullBath\"] = df.loc[:, \"BsmtFullBath\"].fillna(0)\n",
    "    df.loc[:, \"BsmtHalfBath\"] = df.loc[:, \"BsmtHalfBath\"].fillna(0)\n",
    "    df.loc[:, \"BsmtUnfSF\"] = df.loc[:, \"BsmtUnfSF\"].fillna(0)\n",
    "\n",
    "    # CentralAir : NA most likely means No\n",
    "    df.loc[:, \"CentralAir\"] = df.loc[:, \"CentralAir\"].fillna(\"N\")\n",
    "\n",
    "    # Condition : NA most likely means Normal，靠近主干道或铁路\n",
    "    df.loc[:, \"Condition1\"] = df.loc[:, \"Condition1\"].fillna(\"Norm\")\n",
    "    df.loc[:, \"Condition2\"] = df.loc[:, \"Condition2\"].fillna(\"Norm\")\n",
    "\n",
    "    # EnclosedPorch : NA most likely means no enclosed porch\n",
    "    df.loc[:, \"EnclosedPorch\"] = df.loc[:, \"EnclosedPorch\"].fillna(0)\n",
    "\n",
    "    # External stuff : NA most likely means average\n",
    "    df.loc[:, \"ExterCond\"] = df.loc[:, \"ExterCond\"].fillna(\"TA\")\n",
    "    df.loc[:, \"ExterQual\"] = df.loc[:, \"ExterQual\"].fillna(\"TA\")\n",
    "\n",
    "    # Fence : data description says NA means \"no fence\"\n",
    "    df.loc[:, \"Fence\"] = df.loc[:, \"Fence\"].fillna(\"No\")\n",
    "\n",
    "    # FireplaceQu : data description says NA means \"no fireplace\"\n",
    "    df.loc[:, \"FireplaceQu\"] = df.loc[:, \"FireplaceQu\"].fillna(\"No\")\n",
    "    df.loc[:, \"Fireplaces\"] = df.loc[:, \"Fireplaces\"].fillna(0)\n",
    "\n",
    "    # Functional : data description says NA means typical，家用（Home）功能性评级\n",
    "    df.loc[:, \"Functional\"] = df.loc[:, \"Functional\"].fillna(\"Typ\")\n",
    "\n",
    "    # GarageType etc : data description says NA for garage features is \"no garage\"\n",
    "    df.loc[:, \"GarageType\"] = df.loc[:, \"GarageType\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageFinish\"] = df.loc[:, \"GarageFinish\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageQual\"] = df.loc[:, \"GarageQual\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageCond\"] = df.loc[:, \"GarageCond\"].fillna(\"No\")\n",
    "    df.loc[:, \"GarageArea\"] = df.loc[:, \"GarageArea\"].fillna(0)\n",
    "    df.loc[:, \"GarageCars\"] = df.loc[:, \"GarageCars\"].fillna(0)\n",
    "\n",
    "    # HalfBath : NA most likely means no half baths above grade\n",
    "    df.loc[:, \"HalfBath\"] = df.loc[:, \"HalfBath\"].fillna(0)\n",
    "\n",
    "    # HeatingQC : NA most likely means typical\n",
    "    df.loc[:, \"HeatingQC\"] = df.loc[:, \"HeatingQC\"].fillna(\"TA\")\n",
    "\n",
    "    # KitchenAbvGr : NA most likely means 0\n",
    "    df.loc[:, \"KitchenAbvGr\"] = df.loc[:, \"KitchenAbvGr\"].fillna(0)\n",
    "\n",
    "    # KitchenQual : NA most likely means typical\n",
    "    df.loc[:, \"KitchenQual\"] = df.loc[:, \"KitchenQual\"].fillna(\"TA\")\n",
    "\n",
    "    # LotFrontage : NA most likely means no lot frontage\n",
    "    df.loc[:, \"LotFrontage\"] = df.loc[:, \"LotFrontage\"].fillna(0)\n",
    "\n",
    "    # LotShape : NA most likely means regular\n",
    "    df.loc[:, \"LotShape\"] = df.loc[:, \"LotShape\"].fillna(\"Reg\")\n",
    "\n",
    "    # MasVnrType : NA most likely means no veneer，表层砌体（Masonry veneer）类型\n",
    "    df.loc[:, \"MasVnrType\"] = df.loc[:, \"MasVnrType\"].fillna(\"None\")\n",
    "    df.loc[:, \"MasVnrArea\"] = df.loc[:, \"MasVnrArea\"].fillna(0)\n",
    "\n",
    "    # MiscFeature : data description says NA means \"no misc feature\"\n",
    "    df.loc[:, \"MiscFeature\"] = df.loc[:, \"MiscFeature\"].fillna(\"No\")\n",
    "    df.loc[:, \"MiscVal\"] = df.loc[:, \"MiscVal\"].fillna(0)\n",
    "\n",
    "    # OpenPorchSF : NA most likely means no open porch\n",
    "    df.loc[:, \"OpenPorchSF\"] = df.loc[:, \"OpenPorchSF\"].fillna(0)\n",
    "\n",
    "    # PavedDrive : NA most likely means not paved\n",
    "    df.loc[:, \"PavedDrive\"] = df.loc[:, \"PavedDrive\"].fillna(\"N\")\n",
    "\n",
    "    # PoolQC : data description says NA means \"no pool\"\n",
    "    df.loc[:, \"PoolQC\"] = df.loc[:, \"PoolQC\"].fillna(\"No\")\n",
    "    df.loc[:, \"PoolArea\"] = df.loc[:, \"PoolArea\"].fillna(0)\n",
    "\n",
    "    # SaleCondition : NA most likely means normal sale\n",
    "    df.loc[:, \"SaleCondition\"] = df.loc[:, \"SaleCondition\"].fillna(\"Normal\")\n",
    "\n",
    "    # ScreenPorch : NA most likely means no screen porch，观景门廊\n",
    "    df.loc[:, \"ScreenPorch\"] = df.loc[:, \"ScreenPorch\"].fillna(0)\n",
    "\n",
    "    # TotRmsAbvGrd : NA most likely means 0\n",
    "    df.loc[:, \"TotRmsAbvGrd\"] = df.loc[:, \"TotRmsAbvGrd\"].fillna(0)\n",
    "\n",
    "    # Utilities : NA most likely means all public utilities\n",
    "    df.loc[:, \"Utilities\"] = df.loc[:, \"Utilities\"].fillna(\"AllPub\")\n",
    "\n",
    "    # WoodDeckSF : NA most likely means no wood deck\n",
    "    df.loc[:, \"WoodDeckSF\"] = df.loc[:, \"WoodDeckSF\"].fillna(0)\n",
    "    \n",
    "    return df\n",
    "    \n",
    "train = process_missvalue_by_meaning(train)\n",
    "test = process_missvalue_by_meaning(test)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Feb</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>SC20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>Reg</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>May</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Sep</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SC70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Feb</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>Pave</td>\n",
       "      <td>None</td>\n",
       "      <td>IR1</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>AllPub</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Dec</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape LandContour  \\\n",
       "0       SC60       RL         65.0     8450   Pave  None      Reg         Lvl   \n",
       "1       SC20       RL         80.0     9600   Pave  None      Reg         Lvl   \n",
       "2       SC60       RL         68.0    11250   Pave  None      IR1         Lvl   \n",
       "3       SC70       RL         60.0     9550   Pave  None      IR1         Lvl   \n",
       "4       SC60       RL         84.0    14260   Pave  None      IR1         Lvl   \n",
       "\n",
       "  Utilities LotConfig    ...     PoolArea PoolQC Fence MiscFeature MiscVal  \\\n",
       "0    AllPub    Inside    ...            0     No    No          No       0   \n",
       "1    AllPub       FR2    ...            0     No    No          No       0   \n",
       "2    AllPub    Inside    ...            0     No    No          No       0   \n",
       "3    AllPub    Corner    ...            0     No    No          No       0   \n",
       "4    AllPub       FR2    ...            0     No    No          No       0   \n",
       "\n",
       "  MoSold  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0    Feb    2008        WD         Normal     208500  \n",
       "1    May    2007        WD         Normal     181500  \n",
       "2    Sep    2008        WD         Normal     223500  \n",
       "3    Feb    2006        WD        Abnorml     140000  \n",
       "4    Dec    2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def numberical2cat(df):\n",
    "    df.replace({\"MSSubClass\" : {20 : \"SC20\", 30 : \"SC30\", 40 : \"SC40\", 45 : \"SC45\", \n",
    "                                       50 : \"SC50\", 60 : \"SC60\", 70 : \"SC70\", 75 : \"SC75\", \n",
    "                                       80 : \"SC80\", 85 : \"SC85\", 90 : \"SC90\", 120 : \"SC120\", \n",
    "                                       150 : \"SC150\", 160 : \"SC160\", 180 : \"SC180\", 190 : \"SC190\"},\n",
    "                       \"MoSold\" : {1 : \"Jan\", 2 : \"Feb\", 3 : \"Mar\", 4 : \"Apr\", 5 : \"May\", 6 : \"Jun\",\n",
    "                                   7 : \"Jul\", 8 : \"Aug\", 9 : \"Sep\", 10 : \"Oct\", 11 : \"Nov\", 12 : \"Dec\"}\n",
    "                      }, inplace = True)\n",
    "\n",
    "    return df\n",
    "train = numberical2cat(train)\n",
    "test = numberical2cat(test)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>PoolArea</th>\n",
       "      <th>PoolQC</th>\n",
       "      <th>Fence</th>\n",
       "      <th>MiscFeature</th>\n",
       "      <th>MiscVal</th>\n",
       "      <th>MoSold</th>\n",
       "      <th>YrSold</th>\n",
       "      <th>SaleType</th>\n",
       "      <th>SaleCondition</th>\n",
       "      <th>SalePrice</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Feb</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>208500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>SC20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>May</td>\n",
       "      <td>2007</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>181500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Sep</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>223500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SC70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Feb</td>\n",
       "      <td>2006</td>\n",
       "      <td>WD</td>\n",
       "      <td>Abnorml</td>\n",
       "      <td>140000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>No</td>\n",
       "      <td>No</td>\n",
       "      <td>0</td>\n",
       "      <td>Dec</td>\n",
       "      <td>2008</td>\n",
       "      <td>WD</td>\n",
       "      <td>Normal</td>\n",
       "      <td>250000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 80 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  MSSubClass MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \\\n",
       "0       SC60       RL         65.0     8450       2      0         4   \n",
       "1       SC20       RL         80.0     9600       2      0         4   \n",
       "2       SC60       RL         68.0    11250       2      0         3   \n",
       "3       SC70       RL         60.0     9550       2      0         3   \n",
       "4       SC60       RL         84.0    14260       2      0         3   \n",
       "\n",
       "  LandContour  Utilities LotConfig    ...      PoolArea PoolQC Fence  \\\n",
       "0         Lvl          4    Inside    ...             0      0    No   \n",
       "1         Lvl          4       FR2    ...             0      0    No   \n",
       "2         Lvl          4    Inside    ...             0      0    No   \n",
       "3         Lvl          4    Corner    ...             0      0    No   \n",
       "4         Lvl          4       FR2    ...             0      0    No   \n",
       "\n",
       "  MiscFeature MiscVal MoSold  YrSold  SaleType  SaleCondition  SalePrice  \n",
       "0          No       0    Feb    2008        WD         Normal     208500  \n",
       "1          No       0    May    2007        WD         Normal     181500  \n",
       "2          No       0    Sep    2008        WD         Normal     223500  \n",
       "3          No       0    Feb    2006        WD        Abnorml     140000  \n",
       "4          No       0    Dec    2008        WD         Normal     250000  \n",
       "\n",
       "[5 rows x 80 columns]"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def cat2numberical(df):\n",
    "    df.replace({\"Alley\" : {\"None\":0, \"Grvl\" : 1, \"Pave\" : 2},\n",
    "                \"BsmtCond\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"BsmtExposure\" : {\"No\" : 0, \"Mn\" : 1, \"Av\": 2, \"Gd\" : 3},\n",
    "                \"BsmtFinType1\" : {\"No\" : 0, \"Unf\" : 1, \"LwQ\": 2, \"Rec\" : 3, \"BLQ\" : 4, \n",
    "                                         \"ALQ\" : 5, \"GLQ\" : 6},\n",
    "                \"BsmtFinType2\" : {\"No\" : 0, \"Unf\" : 1, \"LwQ\": 2, \"Rec\" : 3, \"BLQ\" : 4, \n",
    "                                         \"ALQ\" : 5, \"GLQ\" : 6},\n",
    "                \"BsmtQual\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"ExterCond\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\": 4, \"Ex\" : 5},\n",
    "                \"ExterQual\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\": 3, \"Gd\": 4, \"Ex\" : 5},\n",
    "                \"FireplaceQu\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"Functional\" : {\"Sal\" : 1, \"Sev\" : 2, \"Maj2\" : 3, \"Maj1\" : 4, \"Mod\": 5, \n",
    "                                       \"Min2\" : 6, \"Min1\" : 7, \"Typ\" : 8},\n",
    "                \"GarageCond\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"GarageQual\" : {\"No\" : 0, \"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"HeatingQC\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"KitchenQual\" : {\"Po\" : 1, \"Fa\" : 2, \"TA\" : 3, \"Gd\" : 4, \"Ex\" : 5},\n",
    "                \"LandSlope\" : {\"Sev\" : 1, \"Mod\" : 2, \"Gtl\" : 3},\n",
    "                \"LotShape\" : {\"IR3\" : 1, \"IR2\" : 2, \"IR1\" : 3, \"Reg\" : 4},\n",
    "                \"PavedDrive\" : {\"N\" : 0, \"P\" : 1, \"Y\" : 2},\n",
    "                \"PoolQC\" : {\"No\" : 0, \"Fa\" : 1, \"TA\" : 2, \"Gd\" : 3, \"Ex\" : 4},\n",
    "                \"Street\" : {\"Grvl\" : 1, \"Pave\" : 2},\n",
    "                \"Utilities\" : {\"ELO\" : 1, \"NoSeWa\" : 2, \"NoSewr\" : 3, \"AllPub\" : 4}},\n",
    "                       inplace = True\n",
    "                     )\n",
    "    return df\n",
    "\n",
    "train = cat2numberical(train)\n",
    "test = cat2numberical(test)\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0       3\n",
      "1       2\n",
      "2       3\n",
      "3       3\n",
      "4       3\n",
      "5       2\n",
      "6       3\n",
      "7       3\n",
      "8       3\n",
      "9       2\n",
      "10      2\n",
      "11      3\n",
      "12      2\n",
      "13      3\n",
      "14      2\n",
      "15      3\n",
      "16      2\n",
      "17      2\n",
      "18      2\n",
      "19      2\n",
      "20      3\n",
      "21      3\n",
      "22      3\n",
      "23      2\n",
      "24      2\n",
      "25      3\n",
      "26      2\n",
      "27      3\n",
      "28      2\n",
      "29      2\n",
      "       ..\n",
      "1430    2\n",
      "1431    2\n",
      "1432    2\n",
      "1433    2\n",
      "1434    2\n",
      "1435    2\n",
      "1436    2\n",
      "1437    3\n",
      "1438    2\n",
      "1439    3\n",
      "1440    2\n",
      "1441    2\n",
      "1442    3\n",
      "1443    2\n",
      "1444    3\n",
      "1445    2\n",
      "1446    2\n",
      "1447    3\n",
      "1448    2\n",
      "1449    2\n",
      "1450    2\n",
      "1451    3\n",
      "1452    2\n",
      "1453    2\n",
      "1454    3\n",
      "1455    2\n",
      "1456    2\n",
      "1457    3\n",
      "1458    2\n",
      "1459    2\n",
      "Name: OverallQual, Length: 1456, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "def simplify(df):\n",
    "    df[\"SimplOverallQual\"] = df.OverallQual.replace({1 : 1, 2 : 1, 3 : 1, # bad\n",
    "                                                    4 : 2, 5 : 2, 6 : 2, # average\n",
    "                                                    7 : 3, 8 : 3, 9 : 3, 10 : 3 # good\n",
    "                                                    }, inplace = True)\n",
    "    df[\"SimplOverallCond\"] = df.OverallCond.replace({1 : 1, 2 : 1, 3 : 1, # bad\n",
    "                                                    4 : 2, 5 : 2, 6 : 2, # average\n",
    "                                                    7 : 3, 8 : 3, 9 : 3, 10 : 3 # good\n",
    "                                                    },inplace = True)\n",
    "    df[\"SimplPoolQC\"] = df.PoolQC.replace({1 : 1, 2 : 1, # average\n",
    "                                           3 : 2, 4 : 2 # good\n",
    "                                          },inplace = True)\n",
    "    df[\"SimplGarageCond\"] = df.GarageCond.replace({1 : 1, # bad\n",
    "                                                2 : 1, 3 : 1, # average\n",
    "                                                4 : 2, 5 : 2 # good\n",
    "                                                        },inplace = True)\n",
    "    df[\"SimplGarageQual\"] = df.GarageQual.replace({1 : 1, # bad\n",
    "                                                    2 : 1, 3 : 1, # average\n",
    "                                                    4 : 2, 5 : 2 # good\n",
    "                                                    },inplace = True)\n",
    "    df[\"SimplFireplaceQu\"] = df.FireplaceQu.replace({1 : 1, # bad\n",
    "                                                           2 : 1, 3 : 1, # average\n",
    "                                                           4 : 2, 5 : 2 # good\n",
    "                                                          },inplace = True)\n",
    "    df[\"SimplFireplaceQu\"] = df.FireplaceQu.replace({1 : 1, # bad\n",
    "                                                           2 : 1, 3 : 1, # average\n",
    "                                                           4 : 2, 5 : 2 # good\n",
    "                                                          },inplace = True)\n",
    "    df[\"SimplFunctional\"] = df.Functional.replace({1 : 1, 2 : 1, # bad\n",
    "                                                         3 : 2, 4 : 2, # major\n",
    "                                                         5 : 3, 6 : 3, 7 : 3, # minor\n",
    "                                                         8 : 4 # typical\n",
    "                                                        },inplace = True)\n",
    "    df[\"SimplKitchenQual\"] = df.KitchenQual.replace({1 : 1, # bad\n",
    "                                                           2 : 1, 3 : 1, # average\n",
    "                                                           4 : 2, 5 : 2 # good\n",
    "                                                          },inplace = True)\n",
    "    df[\"SimplHeatingQC\"] = df.HeatingQC.replace({1 : 1, # bad\n",
    "                                                       2 : 1, 3 : 1, # average\n",
    "                                                       4 : 2, 5 : 2 # good\n",
    "                                                      },inplace = True)\n",
    "    df[\"SimplBsmtFinType1\"] = df.BsmtFinType1.replace({1 : 1, # unfinished\n",
    "                                                             2 : 1, 3 : 1, # rec room\n",
    "                                                             4 : 2, 5 : 2, 6 : 2 # living quarters\n",
    "                                                            },inplace = True)\n",
    "    df[\"SimplBsmtFinType2\"] = df.BsmtFinType2.replace({1 : 1, # unfinished\n",
    "                                                             2 : 1, 3 : 1, # rec room\n",
    "                                                             4 : 2, 5 : 2, 6 : 2 # living quarters\n",
    "                                                            },inplace = True)\n",
    "    df[\"SimplBsmtCond\"] = df.BsmtCond.replace({1 : 1, # bad\n",
    "                                                     2 : 1, 3 : 1, # average\n",
    "                                                     4 : 2, 5 : 2 # good\n",
    "                                                    },inplace = True)\n",
    "    df[\"SimplBsmtQual\"] = df.BsmtQual.replace({1 : 1, # bad\n",
    "                                                     2 : 1, 3 : 1, # average\n",
    "                                                     4 : 2, 5 : 2 # good\n",
    "                                                    },inplace = True)\n",
    "    df[\"SimplExterCond\"] = df.ExterCond.replace({1 : 1, # bad\n",
    "                                                       2 : 1, 3 : 1, # average\n",
    "                                                       4 : 2, 5 : 2 # good\n",
    "                                                      },inplace = True)\n",
    "    df[\"SimplExterQual\"] = df.ExterQual.replace({1 : 1, # bad\n",
    "                                                       2 : 1, 3 : 1, # average\n",
    "                                                       4 : 2, 5 : 2 # good\n",
    "                                                      },inplace = True)\n",
    "    return df\n",
    "\n",
    "train = simplify(train)\n",
    "test = simplify(test)\n",
    "# train.head()\n",
    "# print(train.SimplOverallQual)\n",
    "print(train.OverallQual)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2* Combinations of existing features\n",
    "def Combine(df):\n",
    "    # Overall quality of the house\n",
    "    df[\"OverallGrade\"] = df[\"OverallQual\"] * df[\"OverallCond\"]\n",
    "    # Overall quality of the garage\n",
    "    df[\"GarageGrade\"] = df[\"GarageQual\"] * df[\"GarageCond\"]\n",
    "    # Overall quality of the exterior\n",
    "    df[\"ExterGrade\"] = df[\"ExterQual\"] * df[\"ExterCond\"]\n",
    "    # Overall kitchen score\n",
    "    df[\"KitchenScore\"] = df[\"KitchenAbvGr\"] * df[\"KitchenQual\"]\n",
    "    # Overall fireplace score\n",
    "    df[\"FireplaceScore\"] = df[\"Fireplaces\"] * df[\"FireplaceQu\"]\n",
    "    # Overall garage score\n",
    "    df[\"GarageScore\"] = df[\"GarageArea\"] * df[\"GarageQual\"]\n",
    "    # Overall pool score\n",
    "    df[\"PoolScore\"] = df[\"PoolArea\"] * df[\"PoolQC\"]\n",
    "    # Simplified overall quality of the house\n",
    "    df[\"SimplOverallGrade\"] = df[\"SimplOverallQual\"] * df[\"SimplOverallCond\"]\n",
    "    # Simplified overall quality of the exterior\n",
    "    df[\"SimplExterGrade\"] = df[\"SimplExterQual\"] * df[\"SimplExterCond\"]\n",
    "    # Simplified overall pool score\n",
    "    df[\"SimplPoolScore\"] = df[\"PoolArea\"] * df[\"SimplPoolQC\"]\n",
    "    # Simplified overall garage score\n",
    "    df[\"SimplGarageScore\"] = df[\"GarageArea\"] * df[\"SimplGarageQual\"]\n",
    "    # Simplified overall fireplace score\n",
    "    df[\"SimplFireplaceScore\"] = df[\"Fireplaces\"] * df[\"SimplFireplaceQu\"]\n",
    "    # Simplified overall kitchen score\n",
    "    df[\"SimplKitchenScore\"] = df[\"KitchenAbvGr\"] * df[\"SimplKitchenQual\"]\n",
    "    # Total number of bathrooms\n",
    "    df[\"TotalBath\"] = df[\"BsmtFullBath\"] + (0.5 * df[\"BsmtHalfBath\"]) + \\\n",
    "    df[\"FullBath\"] + (0.5 * df[\"HalfBath\"])\n",
    "    # Total SF for house (incl. basement)\n",
    "    df[\"AllSF\"] = df[\"GrLivArea\"] + df[\"TotalBsmtSF\"]\n",
    "    # Total SF for 1st + 2nd floors\n",
    "    df[\"AllFlrsSF\"] = df[\"1stFlrSF\"] + df[\"2ndFlrSF\"]\n",
    "    # Total SF for porch\n",
    "    df[\"AllPorchSF\"] = df[\"OpenPorchSF\"] + df[\"EnclosedPorch\"] + \\\n",
    "    df[\"3SsnPorch\"] + df[\"ScreenPorch\"]\n",
    "    # Has masonry veneer or not\n",
    "    df[\"HasMasVnr\"] = df.MasVnrType.replace({\"BrkCmn\" : 1, \"BrkFace\" : 1, \"CBlock\" : 1, \n",
    "                                                   \"Stone\" : 1, \"None\" : 0})\n",
    "    # House completed before sale or not\n",
    "    df[\"BoughtOffPlan\"] = df.SaleCondition.replace({\"Abnorml\" : 0, \"Alloca\" : 0, \"AdjLand\" : 0, \n",
    "                                                          \"Family\" : 0, \"Normal\" : 0, \"Partial\" : 1})\n",
    "    \n",
    "    return df\n",
    "\n",
    "#对训练集和测试集分别进行编码\n",
    "train = Combine(train)\n",
    "test = Combine(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>MSSubClass</th>\n",
       "      <th>MSZoning</th>\n",
       "      <th>LotFrontage</th>\n",
       "      <th>LotArea</th>\n",
       "      <th>Street</th>\n",
       "      <th>Alley</th>\n",
       "      <th>LotShape</th>\n",
       "      <th>LandContour</th>\n",
       "      <th>Utilities</th>\n",
       "      <th>LotConfig</th>\n",
       "      <th>...</th>\n",
       "      <th>SimplPoolScore</th>\n",
       "      <th>SimplGarageScore</th>\n",
       "      <th>SimplFireplaceScore</th>\n",
       "      <th>SimplKitchenScore</th>\n",
       "      <th>TotalBath</th>\n",
       "      <th>AllSF</th>\n",
       "      <th>AllFlrsSF</th>\n",
       "      <th>AllPorchSF</th>\n",
       "      <th>HasMasVnr</th>\n",
       "      <th>BoughtOffPlan</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>65.0</td>\n",
       "      <td>8450</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.5</td>\n",
       "      <td>2566</td>\n",
       "      <td>1710</td>\n",
       "      <td>61</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>SC20</td>\n",
       "      <td>RL</td>\n",
       "      <td>80.0</td>\n",
       "      <td>9600</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>4</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.5</td>\n",
       "      <td>2524</td>\n",
       "      <td>1262</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>68.0</td>\n",
       "      <td>11250</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Inside</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.5</td>\n",
       "      <td>2706</td>\n",
       "      <td>1786</td>\n",
       "      <td>42</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>SC70</td>\n",
       "      <td>RL</td>\n",
       "      <td>60.0</td>\n",
       "      <td>9550</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>Corner</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2.0</td>\n",
       "      <td>2473</td>\n",
       "      <td>1717</td>\n",
       "      <td>307</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>SC60</td>\n",
       "      <td>RL</td>\n",
       "      <td>84.0</td>\n",
       "      <td>14260</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>Lvl</td>\n",
       "      <td>4</td>\n",
       "      <td>FR2</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3343</td>\n",
       "      <td>2198</td>\n",
       "      <td>84</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 114 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "  MSSubClass MSZoning  LotFrontage  LotArea  Street  Alley  LotShape  \\\n",
       "0       SC60       RL         65.0     8450       2      0         4   \n",
       "1       SC20       RL         80.0     9600       2      0         4   \n",
       "2       SC60       RL         68.0    11250       2      0         3   \n",
       "3       SC70       RL         60.0     9550       2      0         3   \n",
       "4       SC60       RL         84.0    14260       2      0         3   \n",
       "\n",
       "  LandContour  Utilities LotConfig      ...        SimplPoolScore  \\\n",
       "0         Lvl          4    Inside      ...                   NaN   \n",
       "1         Lvl          4       FR2      ...                   NaN   \n",
       "2         Lvl          4    Inside      ...                   NaN   \n",
       "3         Lvl          4    Corner      ...                   NaN   \n",
       "4         Lvl          4       FR2      ...                   NaN   \n",
       "\n",
       "  SimplGarageScore SimplFireplaceScore SimplKitchenScore TotalBath AllSF  \\\n",
       "0              NaN                 NaN               NaN       3.5  2566   \n",
       "1              NaN                 NaN               NaN       2.5  2524   \n",
       "2              NaN                 NaN               NaN       3.5  2706   \n",
       "3              NaN                 NaN               NaN       2.0  2473   \n",
       "4              NaN                 NaN               NaN       3.5  3343   \n",
       "\n",
       "   AllFlrsSF  AllPorchSF  HasMasVnr  BoughtOffPlan  \n",
       "0       1710          61          1              0  \n",
       "1       1262           0          0              0  \n",
       "2       1786          42          1              0  \n",
       "3       1717         307          0              0  \n",
       "4       2198          84          1              0  \n",
       "\n",
       "[5 rows x 114 columns]"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Find most important features relative to target\n",
      "0.616027698277\n"
     ]
    }
   ],
   "source": [
    "print('Find most important features relative to target')\n",
    "corr = train.corr()\n",
    "corr.sort_values(['SalePrice'], ascending=False, inplace=True)\n",
    "threshold = corr.SalePrice.iloc[11]\n",
    "print(threshold)\n",
    "top10_cols = (corr.SalePrice[corr['SalePrice']>threshold]).axes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "MSSubClass        0\n",
       "MSZoning          0\n",
       "LotFrontage       0\n",
       "LotArea           0\n",
       "Street            0\n",
       "Alley             0\n",
       "LotShape          0\n",
       "LandContour       0\n",
       "Utilities         0\n",
       "LotConfig         0\n",
       "LandSlope         0\n",
       "Neighborhood      0\n",
       "Condition1        0\n",
       "Condition2        0\n",
       "BldgType          0\n",
       "HouseStyle        0\n",
       "OverallQual       0\n",
       "OverallCond       0\n",
       "YearBuilt         0\n",
       "YearRemodAdd      0\n",
       "RoofStyle         0\n",
       "RoofMatl          0\n",
       "Exterior1st       0\n",
       "Exterior2nd       0\n",
       "MasVnrType        0\n",
       "MasVnrArea        0\n",
       "ExterQual         0\n",
       "ExterCond         0\n",
       "Foundation        0\n",
       "BsmtQual          0\n",
       "                 ..\n",
       "AllSF_s2          0\n",
       "AllSF_s3          0\n",
       "AllSF_sq          0\n",
       "AllFlrsSF_s2      0\n",
       "AllFlrsSF_s3      0\n",
       "AllFlrsSF_sq      0\n",
       "GrLivArea_s2      0\n",
       "GrLivArea_s3      0\n",
       "GrLivArea_sq      0\n",
       "OverallQual_s2    0\n",
       "OverallQual_s3    0\n",
       "OverallQual_sq    0\n",
       "GarageCars_s2     0\n",
       "GarageCars_s3     0\n",
       "GarageCars_sq     0\n",
       "TotalBsmtSF_s2    0\n",
       "TotalBsmtSF_s3    0\n",
       "TotalBsmtSF_sq    0\n",
       "GarageArea_s2     0\n",
       "GarageArea_s3     0\n",
       "GarageArea_sq     0\n",
       "TotalBath_s2      0\n",
       "TotalBath_s3      0\n",
       "TotalBath_sq      0\n",
       "ExterQual_s2      0\n",
       "ExterQual_s3      0\n",
       "ExterQual_sq      0\n",
       "1stFlrSF_s2       0\n",
       "1stFlrSF_s3       0\n",
       "1stFlrSF_sq       0\n",
       "Length: 144, dtype: int64"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def Polynominals_top10(df, top10_cols):\n",
    "    for i in range(1, 11):\n",
    "        new_cols_2 = top10_cols[0][i] + '_s' + str(2)\n",
    "        new_cols_3 = top10_cols[0][i] + '_s' + str(3)\n",
    "        new_cols_sq = top10_cols[0][i] + '_sq'\n",
    "        \n",
    "        df[new_cols_2] = df[top10_cols[0][i]] ** 2\n",
    "        df[new_cols_3] = df[top10_cols[0][i]] ** 3\n",
    "        df[new_cols_sq] = np.sqrt(df[top10_cols[0][i]])\n",
    "    return df\n",
    "\n",
    "train = Polynominals_top10(train, top10_cols)\n",
    "test = Polynominals_top10(test, top10_cols)\n",
    "train.head()\n",
    "train.isnull().sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Numberical features: 97\n",
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Columns: 144 entries, MSSubClass to 1stFlrSF_sq\n",
      "dtypes: float64(16), int64(82), object(46)\n",
      "memory usage: 1.6+ MB\n",
      "NAs for numerical features in df : 81\n",
      "Remaining NAs for numerical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "from sklearn.preprocessing import StandardScaler\n",
    "def fillna_numerical_train(df):\n",
    "    numerical_features = df.select_dtypes(exclude = ['object']).columns\n",
    "    numerical_features = numerical_features.drop('SalePrice')\n",
    "    print('Numberical features: ' + str(len(numerical_features)))\n",
    "    df.info()\n",
    "    df_num = df[numerical_features]\n",
    "    medians = df_num.median()\n",
    "    print(\"NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "    df_num = df_num.fillna(medians)\n",
    "    print(\"Remaining NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "    #df_num.info()\n",
    "    # 分别初始化对特征和目标值的标准化器\n",
    "    ss_X = StandardScaler()\n",
    "\n",
    "    # 对训练特征进行标准化处理\n",
    "    temp = ss_X.fit_transform(df_num)\n",
    "    df_num = pd.DataFrame(data=temp, columns=numerical_features, index =df_num.index)\n",
    "    \n",
    "    return df_num, medians, ss_X\n",
    "train_num, medians, ss_X = fillna_numerical_train(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Data columns (total 97 columns):\n",
      "LotFrontage       1456 non-null float64\n",
      "LotArea           1456 non-null float64\n",
      "Street            1456 non-null float64\n",
      "Alley             1456 non-null float64\n",
      "LotShape          1456 non-null float64\n",
      "Utilities         1456 non-null float64\n",
      "LandSlope         1456 non-null float64\n",
      "OverallQual       1456 non-null float64\n",
      "OverallCond       1456 non-null float64\n",
      "YearBuilt         1456 non-null float64\n",
      "YearRemodAdd      1456 non-null float64\n",
      "MasVnrArea        1456 non-null float64\n",
      "ExterQual         1456 non-null float64\n",
      "ExterCond         1456 non-null float64\n",
      "BsmtQual          1456 non-null float64\n",
      "BsmtCond          1456 non-null float64\n",
      "BsmtExposure      1456 non-null float64\n",
      "BsmtFinType1      1456 non-null float64\n",
      "BsmtFinSF1        1456 non-null float64\n",
      "BsmtFinType2      1456 non-null float64\n",
      "BsmtFinSF2        1456 non-null float64\n",
      "BsmtUnfSF         1456 non-null float64\n",
      "TotalBsmtSF       1456 non-null float64\n",
      "HeatingQC         1456 non-null float64\n",
      "1stFlrSF          1456 non-null float64\n",
      "2ndFlrSF          1456 non-null float64\n",
      "LowQualFinSF      1456 non-null float64\n",
      "GrLivArea         1456 non-null float64\n",
      "BsmtFullBath      1456 non-null float64\n",
      "BsmtHalfBath      1456 non-null float64\n",
      "FullBath          1456 non-null float64\n",
      "HalfBath          1456 non-null float64\n",
      "BedroomAbvGr      1456 non-null float64\n",
      "KitchenAbvGr      1456 non-null float64\n",
      "KitchenQual       1456 non-null float64\n",
      "TotRmsAbvGrd      1456 non-null float64\n",
      "Functional        1456 non-null float64\n",
      "Fireplaces        1456 non-null float64\n",
      "FireplaceQu       1456 non-null float64\n",
      "GarageYrBlt       1456 non-null float64\n",
      "GarageCars        1456 non-null float64\n",
      "GarageArea        1456 non-null float64\n",
      "GarageQual        1456 non-null float64\n",
      "GarageCond        1456 non-null float64\n",
      "PavedDrive        1456 non-null float64\n",
      "WoodDeckSF        1456 non-null float64\n",
      "OpenPorchSF       1456 non-null float64\n",
      "EnclosedPorch     1456 non-null float64\n",
      "3SsnPorch         1456 non-null float64\n",
      "ScreenPorch       1456 non-null float64\n",
      "PoolArea          1456 non-null float64\n",
      "PoolQC            1456 non-null float64\n",
      "MiscVal           1456 non-null float64\n",
      "YrSold            1456 non-null float64\n",
      "OverallGrade      1456 non-null float64\n",
      "GarageGrade       1456 non-null float64\n",
      "ExterGrade        1456 non-null float64\n",
      "KitchenScore      1456 non-null float64\n",
      "FireplaceScore    1456 non-null float64\n",
      "GarageScore       1456 non-null float64\n",
      "PoolScore         1456 non-null float64\n",
      "TotalBath         1456 non-null float64\n",
      "AllSF             1456 non-null float64\n",
      "AllFlrsSF         1456 non-null float64\n",
      "AllPorchSF        1456 non-null float64\n",
      "HasMasVnr         1456 non-null float64\n",
      "BoughtOffPlan     1456 non-null float64\n",
      "AllSF_s2          1456 non-null float64\n",
      "AllSF_s3          1456 non-null float64\n",
      "AllSF_sq          1456 non-null float64\n",
      "AllFlrsSF_s2      1456 non-null float64\n",
      "AllFlrsSF_s3      1456 non-null float64\n",
      "AllFlrsSF_sq      1456 non-null float64\n",
      "GrLivArea_s2      1456 non-null float64\n",
      "GrLivArea_s3      1456 non-null float64\n",
      "GrLivArea_sq      1456 non-null float64\n",
      "OverallQual_s2    1456 non-null float64\n",
      "OverallQual_s3    1456 non-null float64\n",
      "OverallQual_sq    1456 non-null float64\n",
      "GarageCars_s2     1456 non-null float64\n",
      "GarageCars_s3     1456 non-null float64\n",
      "GarageCars_sq     1456 non-null float64\n",
      "TotalBsmtSF_s2    1456 non-null float64\n",
      "TotalBsmtSF_s3    1456 non-null float64\n",
      "TotalBsmtSF_sq    1456 non-null float64\n",
      "GarageArea_s2     1456 non-null float64\n",
      "GarageArea_s3     1456 non-null float64\n",
      "GarageArea_sq     1456 non-null float64\n",
      "TotalBath_s2      1456 non-null float64\n",
      "TotalBath_s3      1456 non-null float64\n",
      "TotalBath_sq      1456 non-null float64\n",
      "ExterQual_s2      1456 non-null float64\n",
      "ExterQual_s3      1456 non-null float64\n",
      "ExterQual_sq      1456 non-null float64\n",
      "1stFlrSF_s2       1456 non-null float64\n",
      "1stFlrSF_s3       1456 non-null float64\n",
      "1stFlrSF_sq       1456 non-null float64\n",
      "dtypes: float64(97)\n",
      "memory usage: 1.1 MB\n"
     ]
    }
   ],
   "source": [
    "train_num.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Numerical features : 97\n",
      "NAs for numerical features in df : 88\n",
      "Remaining NAs for numerical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "def fillna_numerical_test(df, medians, ss_X):\n",
    "    numerical_features = df.select_dtypes(exclude = [\"object\"]).columns\n",
    "    #numerical_features = numerical_features.drop(\"SalePrice\")  #测试集中没有SalePrice\n",
    "    print(\"Numerical features : \" + str(len(numerical_features)))\n",
    "\n",
    "    df_num = df[numerical_features]\n",
    "    \n",
    "    # Handle remaining missing values for numerical features by using median as replacement\n",
    "    print(\"NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "    df_num = df_num.fillna(medians)\n",
    "    print(\"Remaining NAs for numerical features in df : \" + str(df_num.isnull().values.sum()))\n",
    "\n",
    "    #对数值特征进行标准化\n",
    "    temp = ss_X.transform(df_num)\n",
    "    df_num = pd.DataFrame(data=temp, columns=numerical_features, index =df_num.index )\n",
    "    return df_num\n",
    "\n",
    "test_num = fillna_numerical_test(test, medians, ss_X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Categorical features : 46\n",
      "NAs for categorical features in df : 61223\n",
      "Remaining NAs for categorical features in df : 0\n"
     ]
    }
   ],
   "source": [
    "def get_dummies_cat(df):\n",
    "    categorical_features = df.select_dtypes(include = [\"object\"]).columns\n",
    "    print(\"Categorical features : \" + str(len(categorical_features)))\n",
    "    df_cat = df[categorical_features]\n",
    "    \n",
    "\n",
    "    # Create dummy features for categorical values via one-hot encoding\n",
    "    print(\"NAs for categorical features in df : \" + str(df_cat.isnull().values.sum()))\n",
    "    df_cat = pd.get_dummies(df_cat,dummy_na=True)\n",
    "    print(\"Remaining NAs for categorical features in df : \" + str(df_cat.isnull().values.sum()))\n",
    "    \n",
    "    return df_cat\n",
    "\n",
    "#必须考虑类别型特征的取值范围（训练集和测试的取值范围可能不同）\n",
    "#train_cat = get_dummies_cat(train)\n",
    "#test_cat = get_dummies_cat(test)\n",
    "\n",
    "n_train_samples = train.shape[0]  \n",
    "train_test = pd.concat((train, test), axis=0)\n",
    "train_test_cat = get_dummies_cat(train_test)\n",
    "   \n",
    "train_cat = train_test_cat.iloc[:n_train_samples, :]\n",
    "test_cat = train_test_cat.iloc[n_train_samples:, :]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Columns: 246 entries, BldgType_1Fam to SimplPoolScore_nan\n",
      "dtypes: uint8(246)\n",
      "memory usage: 361.2 KB\n"
     ]
    }
   ],
   "source": [
    "train_cat.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "New number of features : 343\n",
      "New number of features : 343\n"
     ]
    }
   ],
   "source": [
    "def joint_num_cat(df_num, df_cat):\n",
    "    df = pd.concat([df_num, df_cat], axis = 1, ignore_index=True)\n",
    "    print(\"New number of features : \" + str(df.shape[1]))\n",
    "    \n",
    "    return df\n",
    "\n",
    "FE_train = joint_num_cat(train_num, train_cat)\n",
    "FE_test = joint_num_cat(test_num, test_cat)\n",
    "\n",
    "FE_train = pd.concat([FE_train, train['SalePrice']], axis = 1)\n",
    "FE_test = pd.concat([test_id,FE_test], axis = 1)\n",
    "\n",
    "FE_train.to_csv('AmesHouse_FE_train.csv', index=False)\n",
    "FE_test.to_csv('AmesHouse_FE_test.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 1456 entries, 0 to 1459\n",
      "Columns: 344 entries, 0 to SalePrice\n",
      "dtypes: float64(97), int64(1), uint8(246)\n",
      "memory usage: 1.4 MB\n"
     ]
    }
   ],
   "source": [
    "FE_train.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
